In [51]:
import json
import pandas as pd
from datetime import datetime
import time
from tqdm import tqdm
import airbnb_scraper.pyairbnb as pyairbnb

In [None]:
def batch_scrape(guest_ids, api_key, cookies, proxy_url=None, delay=0.5):
    basic_info_list = []
    host_reviews_list = []
    guest_reviews_list = []
    
    for gid in tqdm(guest_ids, desc="Scraping guests"):
        try:
            data = pyairbnb.guest_details.get(api_key, cookies, gid, proxy_url)
            user_profile = data["data"]["presentation"]["userProfileContainer"]["userProfile"]
            if not user_profile:
                continue

            # ===== Basic Information =====
            basic_info = {
                "guest_id": gid,
                "smartName": user_profile.get("smartName"),
                "displayFirstName": user_profile.get("displayFirstName"),
                "about": user_profile.get("about"),
                "createdAt": user_profile.get("createdAt"),
                "join_year": datetime.fromisoformat(user_profile["createdAt"]).year if user_profile.get("createdAt") else None,
                "join_month": datetime.fromisoformat(user_profile["createdAt"]).month if user_profile.get("createdAt") else None,
                "isSuperhost": user_profile.get("isSuperhost"),
                "isHomeHost": user_profile.get("isHomeHost"),
                "guestType": user_profile.get("guestType"),
                "facebookConnected": user_profile.get("facebookConnected"),
                "hasProfilePicture": user_profile.get("hasProfilePicture"),
                "profilePictureUrl": user_profile.get("profilePictureUrl"),
                "identityVerified": user_profile.get("hasIdentityBadge"),
                "verifiedTypes": ",".join(user_profile.get("identityVerificationTypes", [])),
                "languages": ",".join([lang["name"] for lang in user_profile.get("i18nSpokenLanguages", [])]),
                "interests": ",".join([i["title"] for i in user_profile.get("allInterestsList", [])]),
                "years_on_platform": user_profile.get("timeAsUser", {}).get("years"),
                "months_on_platform": user_profile.get("timeAsUser", {}).get("months"),
            }
            basic_info_list.append(basic_info)

            # ===== Host Reviews (Guest as Host) =====
            for r in user_profile.get("reviewsReceivedFromGuests", {}).get("reviews", []):
                host_reviews_list.append({
                    "guest_id": gid,
                    # Main fields
                    "id": r.get("id"),
                    "createdAt": r.get("createdAt"),
                    "comments": r.get("comments"),
                    "entityType": r.get("entityType"),
                    "rating": r.get("rating"),
                    "response": r.get("response"),
                    "respondedAt": r.get("respondedAt"),
                    "isHostHighlight": r.get("isHostHighlight"),
                    "listing": r.get("listing"),
                    "photos": r.get("photos"),

                    # Reviewee (current guest)
                    "reviewee.id": r.get("reviewee", {}).get("id"),
                    "reviewee.smartName": r.get("reviewee", {}).get("smartName"),
                    "reviewee.pictureUrl": r.get("reviewee", {}).get("pictureUrl"),
                    "reviewee.createdAt": r.get("reviewee", {}).get("createdAt"),
                    "reviewee.isSuperhost": r.get("reviewee", {}).get("isSuperhost"),

                    # Reviewer (guest)
                    "reviewer.id": r.get("reviewer", {}).get("id"),
                    "reviewer.smartName": r.get("reviewer", {}).get("smartName"),
                    "reviewer.pictureUrl": r.get("reviewer", {}).get("pictureUrl"),
                    "reviewer.location": r.get("reviewer", {}).get("location"),
                    "reviewer.createdAt": r.get("reviewer", {}).get("createdAt"),
                    "reviewer.isSuperhost": r.get("reviewer", {}).get("isSuperhost"),

                    # Translation fields
                    "translation.comments": r.get("translation", {}).get("comments"),
                    "translation.commentsLanguage": r.get("translation", {}).get("commentsLanguage"),
                    "translation.response": r.get("translation", {}).get("response"),
                    "translation.responseLanguage": r.get("translation", {}).get("responseLanguage"),
                })

            # ===== Guest Reviews (Guest as Guest) =====
            for r in user_profile.get("reviewsReceivedFromHosts", {}).get("reviews", []):
                guest_reviews_list.append({
                    "guest_id": gid,
                    # Main fields
                    "id": r.get("id"),
                    "createdAt": r.get("createdAt"),
                    "comments": r.get("comments"),
                    "entityType": r.get("entityType"),
                    "rating": r.get("rating"),
                    "response": r.get("response"),
                    "respondedAt": r.get("respondedAt"),
                    "isHostHighlight": r.get("isHostHighlight"),
                    "listing": r.get("listing"),
                    "photos": r.get("photos"),

                    # Reviewee (guest themselves)
                    "reviewee.id": r.get("reviewee", {}).get("id"),
                    "reviewee.smartName": r.get("reviewee", {}).get("smartName"),
                    "reviewee.pictureUrl": r.get("reviewee", {}).get("pictureUrl"),
                    "reviewee.createdAt": r.get("reviewee", {}).get("createdAt"),
                    "reviewee.isSuperhost": r.get("reviewee", {}).get("isSuperhost"),

                    # Reviewer (host)
                    "reviewer.id": r.get("reviewer", {}).get("id"),
                    "reviewer.smartName": r.get("reviewer", {}).get("smartName"),
                    "reviewer.pictureUrl": r.get("reviewer", {}).get("pictureUrl"),
                    "reviewer.location": r.get("reviewer", {}).get("location"),
                    "reviewer.createdAt": r.get("reviewer", {}).get("createdAt"),
                    "reviewer.isSuperhost": r.get("reviewer", {}).get("isSuperhost"),

                    # Translation fields
                    "translation.comments": r.get("translation", {}).get("comments"),
                    "translation.commentsLanguage": r.get("translation", {}).get("commentsLanguage"),
                    "translation.response": r.get("translation", {}).get("response"),
                    "translation.responseLanguage": r.get("translation", {}).get("responseLanguage"),
                })

        except Exception as e:
            print(f"❌ Error for guest_id {gid}: {e}")
        time.sleep(delay)

    # ===== Save CSV Files =====
    pd.DataFrame(basic_info_list).to_csv(r"C:\Users\fraxi\OneDrive\Desktop\code task\Amsterdam\open_guest_basic_info.csv", index=False, encoding="utf-8-sig")
    pd.DataFrame(host_reviews_list).to_csv(r"C:\Users\fraxi\OneDrive\Desktop\code task\Amsterdam\open_guest_reviews_by_guests.csv", index=False, encoding="utf-8-sig")
    pd.DataFrame(guest_reviews_list).to_csv(r"C:\Users\fraxi\OneDrive\Desktop\code task\Amsterdam\open_guest_reviews_by_hosts.csv", index=False, encoding="utf-8-sig")
    print("✅ All files have been saved!")


In [None]:
df = pd.read_csv(r"C:\Users\fraxi\OneDrive\Desktop\code task\Amsterdam\transaction_pairs.csv")

guest_ids = df['guest_id'].dropna().astype(str).drop_duplicates().head(50).tolist()


In [54]:
proxy_url=None
api_key = pyairbnb.get_api_key(proxy_url)
cookies = {}
results = []

In [55]:

batch_scrape(guest_ids, api_key, cookies)

Scraping guests:   6%|▌         | 3/50 [00:00<00:15,  3.00it/s]

❌ Error for guest_id 17290755: 'NoneType' object has no attribute 'get'


Scraping guests:  20%|██        | 10/50 [00:03<00:13,  3.01it/s]

❌ Error for guest_id 4461304: 'NoneType' object has no attribute 'get'


Scraping guests:  38%|███▊      | 19/50 [00:07<00:10,  3.07it/s]

❌ Error for guest_id 428669771: 'NoneType' object has no attribute 'get'


Scraping guests:  58%|█████▊    | 29/50 [00:10<00:06,  3.13it/s]

❌ Error for guest_id 54440846: 'NoneType' object has no attribute 'get'


Scraping guests:  64%|██████▍   | 32/50 [00:12<00:07,  2.54it/s]

❌ Error for guest_id 225915373: 'NoneType' object has no attribute 'get'


Scraping guests:  68%|██████▊   | 34/50 [00:13<00:07,  2.21it/s]

❌ Error for guest_id 11043978: 'NoneType' object has no attribute 'get'


Scraping guests:  80%|████████  | 40/50 [00:16<00:03,  2.61it/s]

❌ Error for guest_id 38161415: 'NoneType' object has no attribute 'get'


Scraping guests: 100%|██████████| 50/50 [00:19<00:00,  2.50it/s]

✅ 所有文件已保存！



