# In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Rakuten Travel レビュー収集 2024（県別 70 件 / v12.1 — f‑string 修正）
──────────────────────────────────────────────
◎ 県ごとに 70 件を目標に 2024 年レビューを収集
◎ 1 ホテル最大 5 件 / 30 ホテルずつ並列処理（不足時に追加）
◎ WORKERS = RT_MAX_WORKERS（デフォルト 24）
"""

import os
import random
import re
import time
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from urllib.parse import urljoin

import requests
from requests.adapters import HTTPAdapter
from bs4 import BeautifulSoup
import pandas as pd

# ─────────── Settings ───────────
# Collection targets
MAX_PER_PREF = 70        # reviews wanted per prefecture
MAX_REV_HOTEL = 5        # reviews taken from any single hotel
TARGET_YEAR = "2024年"    # text that must appear on a review detail page
MONTH_CODES = [f"2024{month:02d}" for month in range(1, 13)]  # "202401" .. "202412"

# Concurrency / pacing
MAX_WORKERS = int(os.getenv("RT_MAX_WORKERS", 24))  # overridable via environment
SLEEP_RANGE = (0.05, 0.12)  # (min, max) seconds for the random pause between requests

# Output and request headers
OUT_FILE = "japan_2024_reviews.csv"
UA = {"User-Agent": "Mozilla/5.0"}

# Paging limits
REV_PAGE_SIZE = 20         # offset step between review-list pages
MAX_PAGES_PER_M = 4        # 4 * 20 = 80 reviews / hotel / month
MAX_HOTELS_PER_BATCH = 25  # 25 * 5 = 125 > 70, so one batch can be enough

# Captures the numeric hotel id from listing links (case-insensitive).
PAT_ID = re.compile(r"/(?:HOTEL|hotelinfo)/(\d+)/", re.I)

# Prefecture slugs substituted into the listing URL templates below.
PREF_SLUGS = [
    # Hokkaido / Tohoku
    "hokkaido", "aomori", "iwate", "miyagi", "akita", "yamagata", "fukushima",
    # Kanto
    "ibaraki", "tochigi", "gunma", "saitama", "chiba", "tokyo", "kanagawa",
    # Chubu
    "niigata", "toyama", "ishikawa", "fukui", "yamanashi", "nagano", "gifu",
    "shizuoka", "aichi",
    # Kinki (including Mie)
    "mie", "shiga", "kyoto", "osaka", "hyogo", "nara", "wakayama",
    # Chugoku
    "tottori", "shimane", "okayama", "hiroshima", "yamaguchi",
    # Shikoku
    "tokushima", "kagawa", "ehime", "kochi",
    # Kyushu / Okinawa
    "fukuoka", "saga", "nagasaki", "kumamoto", "oita", "miyazaki",
    "kagoshima", "okinawa",
]

# Hotel listing URL templates; both take ``slug`` and ``page``.
LIST_UI = "https://travel.rakuten.co.jp/yado/{slug}/?page={page}"
LIST_FALLBACK = "https://travel.rakuten.co.jp/office/pref/{slug}.html?p={page}"

# ─────────── HTTP session ───────────
# Module-level session reused by http_get for every request.  The HTTPS
# adapter's pool_maxsize is set to twice MAX_WORKERS.
_adapter = HTTPAdapter(pool_maxsize=MAX_WORKERS * 2)
_session = requests.Session()
_session.headers.update(UA)
_session.mount("https://", _adapter)


# Statuses that will not change on retry; http_get gives up on them at once.
_PERMANENT_STATUSES = frozenset({404, 410})


def http_get(url: str, **kw):
    """Fetch *url* through the shared session, trying up to 3 times.

    Args:
        url: URL to request.
        **kw: Extra keyword arguments forwarded to ``Session.get`` (for
            example ``params``).  A caller-supplied ``timeout`` replaces
            the 8-second default instead of clashing with it.

    Returns:
        The response object when the server answers with status 200.
        ``None`` when the server answers 404/410, or when every attempt
        ended in a non-200 status or a ``requests.RequestException``.
    """
    kw.setdefault("timeout", 8)
    attempts = 3
    for attempt in range(1, attempts + 1):
        try:
            r = _session.get(url, **kw)
        except requests.RequestException:
            # Network-level failure: fall through to the retry pause.
            pass
        else:
            if r.status_code == 200:
                return r
            if r.status_code in _PERMANENT_STATUSES:
                # The resource is gone; retrying would only waste time.
                return None
        # Pause only when another attempt follows, so a final failure
        # returns immediately instead of sleeping first.
        if attempt < attempts:
            time.sleep(0.8)
    return None

# ─────────── Hotel listing ───────────

def fetch_ids(base_url: str, slug: str, max_pages: int = 100, patience: int = 3):
    """Gather hotel ids from the paginated listing of one prefecture.

    Pages 1..max_pages of ``base_url`` (formatted with ``slug`` and
    ``page``) are requested in order.  Paging stops early when a request
    fails, or once ``patience`` consecutive pages contribute no id that was
    not already known.

    Returns:
        A set of hotel-id strings matched by ``PAT_ID``.
    """
    hotel_ids = set()
    pages_without_new = 0
    page_no = 1
    while page_no <= max_pages:
        response = http_get(base_url.format(slug=slug, page=page_no))
        if not response:
            break

        found = set(PAT_ID.findall(response.text))
        if found <= hotel_ids:
            # Nothing new on this page: extend the run of stale pages.
            pages_without_new += 1
        else:
            pages_without_new = 0
        hotel_ids |= found

        if pages_without_new >= patience:
            break
        # Short random pause before asking for the next page.
        time.sleep(random.uniform(*SLEEP_RANGE))
        page_no += 1
    return hotel_ids

# ─────────── Reviews ───────────
# CSS selectors for the anchors on a review-list page that lead to the
# individual review detail pages.
LINK_SEL = "p.reviewTitle a, h2.commentTitle a, div.rvTtl a, div.rvTltl a"


def fetch_reviews(hid: str):
    """Collect up to MAX_REV_HOTEL review texts for one hotel.

    For each month code in MONTH_CODES, up to MAX_PAGES_PER_M list pages
    are requested (offset step REV_PAGE_SIZE).  Every review link found is
    followed once; a detail page is accepted only if its text contains
    TARGET_YEAR and it has a ``p.commentSentence`` element.

    Args:
        hid: Hotel id used in the review-list URL.

    Returns:
        A list of review body strings, at most MAX_REV_HOTEL long.  The
        function returns as soon as that cap is reached.
    """
    list_url = f"https://review.travel.rakuten.co.jp/hotel/voice/{hid}/"
    reviews, seen = [], set()
    for ym in MONTH_CODES:
        for pg in range(MAX_PAGES_PER_M):
            res = http_get(
                list_url,
                params={"f_time": ym, "f_sort": "0", "f_next": str(pg * REV_PAGE_SIZE)},
            )
            if not res:
                break
            links = BeautifulSoup(res.text, "html.parser").select(LINK_SEL)
            if not links:
                # No review links at this offset, so higher offsets for the
                # same month are skipped.  Keep the usual pause before the
                # next month's first request.
                time.sleep(random.uniform(*SLEEP_RANGE))
                break
            for a in links:
                href = a.get("href", "")
                # An empty href would resolve to the list page itself.
                if not href or href in seen:
                    continue
                seen.add(href)
                # urljoin handles absolute, root-relative, relative and
                # protocol-relative hrefs; plain concatenation only handled
                # the first two.
                detail_url = urljoin(list_url, href)
                if not detail_url.startswith(("http://", "https://")):
                    # e.g. "javascript:" or "mailto:" links cannot be fetched.
                    continue
                det = http_get(detail_url)
                if not det:
                    continue
                # Cheap substring check first; parse only promising pages.
                if TARGET_YEAR not in det.text:
                    continue
                body = BeautifulSoup(det.text, "html.parser").select_one("p.commentSentence")
                if not body:
                    continue
                reviews.append(body.get_text(strip=True))
                if len(reviews) >= MAX_REV_HOTEL:
                    return reviews
            time.sleep(random.uniform(*SLEEP_RANGE))
    return reviews

# ─────────── Main ───────────
# Column order of the output CSV; also guarantees a header row when no
# review was collected at all.
CSV_COLUMNS = ["hotel_id", "prefecture", "review_text"]

# Below this many ids from the primary listing, the fallback listing is
# queried as well.
MIN_IDS_BEFORE_FALLBACK = 150


def _collect_prefecture(slug):
    """Return up to MAX_PER_PREF review rows for one prefecture slug."""
    ids = fetch_ids(LIST_UI, slug)
    if len(ids) < MIN_IDS_BEFORE_FALLBACK:
        ids = ids | fetch_ids(LIST_FALLBACK, slug, 60, 3)
    ids = list(ids)
    random.shuffle(ids)

    pref_rows = []
    with ThreadPoolExecutor(MAX_WORKERS) as pool:
        for offset in range(0, len(ids), MAX_HOTELS_PER_BATCH):
            if len(pref_rows) >= MAX_PER_PREF:
                break
            batch_ids = ids[offset: offset + MAX_HOTELS_PER_BATCH]
            futures = {pool.submit(fetch_reviews, hid): hid for hid in batch_ids}

            for fut in as_completed(futures):
                hid = futures[fut]
                try:
                    revs = fut.result()
                except Exception as exc:
                    # One failing hotel must not abort the run and discard
                    # every row collected so far; report it and move on.
                    print(f"   ⚠ hotel {hid} skipped: {exc!r}")
                    continue
                need = MAX_PER_PREF - len(pref_rows)
                pref_rows.extend(
                    {"hotel_id": hid, "prefecture": slug, "review_text": txt}
                    for txt in revs[:need]
                )
                if len(pref_rows) >= MAX_PER_PREF:
                    # Cancel futures of this batch that have not started;
                    # ones already running finish when the pool shuts down.
                    for f in futures:
                        f.cancel()
                    break
    return pref_rows


def main():
    """Collect reviews for every prefecture and write them to OUT_FILE.

    Prints progress per prefecture, then writes one CSV row per review
    (columns CSV_COLUMNS, UTF-8 with BOM) and a final summary line.
    """
    start = time.time()
    rows = []

    for slug in PREF_SLUGS:
        print(f"🌏 {slug}")
        pref_rows = _collect_prefecture(slug)
        rows.extend(pref_rows)
        print(f"   ✔ collected {len(pref_rows)}/{MAX_PER_PREF}")

    # Save
    pd.DataFrame(rows, columns=CSV_COLUMNS).to_csv(OUT_FILE, index=False, encoding="utf-8-sig")
    duration = (time.time() - start) / 60
    print(f"\n完了 {len(rows)} 件 / {duration:.1f} 分  (WORKERS={MAX_WORKERS})")


if __name__ == "__main__":
    main()
