In [None]:
import airbnb_scraper.pyairbnb as pyairbnb
import pandas as pd
import time
import re
import csv

# Source CSV for the scrape; the next cell reads it with pandas and uses its
# "listing_id" column.
# NOTE(review): absolute, machine-specific Windows path -- the notebook will not
# run on another machine without editing this line.
input_csv = r"C:\Users\fraxi\OneDrive\Desktop\code task\Amsterdam\transaction_pairs.csv"
# Destination CSV for the extracted host responses (relative path, so it is
# resolved against the current working directory).
output_csv = "B(k,·)_host_reviews_written.csv"
# Passed as the second argument to pyairbnb.get_reviews
# (presumably a language/locale code -- confirm against pyairbnb).
language = "en"
# Passed as the third argument to pyairbnb.get_reviews
# (None presumably means "no proxy" -- confirm against pyairbnb).
proxy_url = None  
# Number of seconds slept after each listing is processed.
delay_seconds = 1 

In [None]:

# Read listing_id as text so each ID reaches the room URL exactly as it is
# written in the CSV. Left to type inference, a column containing any missing
# value is parsed as float64, and astype(str) then produces "27886.0"
# (and very large IDs lose precision), which builds a wrong URL.
df = pd.read_csv(input_csv, dtype={"listing_id": str})

# Unique, non-missing, non-blank IDs in first-seen order.
listing_ids = (
    df["listing_id"]
    .dropna()
    .str.strip()
    .loc[lambda ids: ids != ""]
    .unique()
    .tolist()
)
print(f"🔍 共获取 {len(listing_ids)} 个唯一 listing_id")

# Accumulates one dict per review that carries a host response.
results = []

# Matches a host reply embedded in the review text after a <br> tag, e.g.
# "...<br/>Response: thanks!". Group 1 is the reply text (up to the end of the line).
_EMBEDDED_RESPONSE_RE = re.compile(
    r"<br\s*/?>\s*(?:host\s*)?response\s*:?\s*(.*)", flags=re.IGNORECASE
)


def _extract_host_response(review):
    """Return the host response for one review dict, or a falsy value if none.

    The "response" field is preferred. When it is missing or empty, the
    "comments" text is searched for an embedded "<br> [host] response: ..."
    fragment and the stripped remainder is returned instead.
    """
    response = review.get("response")
    comments = review.get("comments")
    if not response and comments:
        match = _EMBEDDED_RESPONSE_RE.search(comments)
        if match:
            response = match.group(1).strip()
    return response


total_listings = len(listing_ids)

for idx, listing_id in enumerate(listing_ids, 1):
    room_url = f"https://www.airbnb.com/rooms/{listing_id}"
    print(f"\n🔹[{idx}/{len(listing_ids)}] 正在爬取房源 {listing_id}...")

    try:
        reviews = pyairbnb.get_reviews(room_url, language, proxy_url)
        for review in reviews:
            response = _extract_host_response(review)
            if not response:
                continue

            # "reviewee" may be present but None; .get("reviewee", {}) would
            # then return None and the chained .get raise AttributeError,
            # which the except below would turn into losing every remaining
            # review of this listing.
            reviewee = review.get("reviewee") or {}
            results.append({
                "review_id": review.get("id"),
                "listing_id": listing_id,
                "host_id": reviewee.get("id"),
                "created_at": review.get("createdAt"),
                "response": response
            })

    except Exception as e:
        # Best effort: report the failure and move on to the next listing.
        print(f"failed: {e}")

    # Throttle between requests; nothing follows the last listing, so skip
    # the final wait.
    if idx < total_listings:
        time.sleep(delay_seconds)


# Persist the collected host responses. The file is only created when at
# least one response was found.
if not results:
    print(" No reviews contain host responses.")
else:
    # Column order of the output file; keys match the dicts stored in `results`.
    csv_columns = ["review_id", "listing_id", "host_id", "created_at", "response"]
    with open(output_csv, "w", newline='', encoding="utf-8-sig") as f:
        writer = csv.DictWriter(f, fieldnames=csv_columns)
        writer.writeheader()
        writer.writerows(results)
    print(f" A total of {len(results)} reviews with host responses have been extracted and saved to: {output_csv}")


🔍 共获取 932 个唯一 listing_id

🔹[1/932] 正在爬取房源 27886...



🔹[2/932] 正在爬取房源 28871...

🔹[3/932] 正在爬取房源 29051...

🔹[4/932] 正在爬取房源 49552...

🔹[5/932] 正在爬取房源 50263...

🔹[6/932] 正在爬取房源 50515...

🔹[7/932] 正在爬取房源 50523...

🔹[8/932] 正在爬取房源 55709...

🔹[9/932] 正在爬取房源 55868...

🔹[10/932] 正在爬取房源 62015...

🔹[11/932] 正在爬取房源 80635...

🔹[12/932] 正在爬取房源 91535...

🔹[13/932] 正在爬取房源 137026...

🔹[14/932] 正在爬取房源 158633...

🔹[15/932] 正在爬取房源 168769...

🔹[16/932] 正在爬取房源 179528...

🔹[17/932] 正在爬取房源 188347...

🔹[18/932] 正在爬取房源 189754...

🔹[19/932] 正在爬取房源 190943...

🔹[20/932] 正在爬取房源 213371...

🔹[21/932] 正在爬取房源 214531...

🔹[22/932] 正在爬取房源 245927...

🔹[23/932] 正在爬取房源 247805...

🔹[24/932] 正在爬取房源 252080...

🔹[25/932] 正在爬取房源 254104...

🔹[26/932] 正在爬取房源 254800...

🔹[27/932] 正在爬取房源 255809...

🔹[28/932] 正在爬取房源 274706...

🔹[29/932] 正在爬取房源 283170...

🔹[30/932] 正在爬取房源 286500...

🔹[31/932] 正在爬取房源 290701...

🔹[32/932] 正在爬取房源 304082...

🔹[33/932] 正在爬取房源 306852...

🔹[34/932] 正在爬取房源 307497...

🔹[35/932] 正在爬取房源 307621...

🔹[36/932] 正在爬取房源 308028...

🔹[37/932] 正在爬取房源 311124...

🔹[38/932] 

In [None]:
import pandas as pd

# --- 1. Read the raw host-response table ---
# Expected columns: review_id, listing_id, host_id, created_at, response.
# Adjust the path if the file lives somewhere else.
df = pd.read_csv("B(k,·)_host_reviews_written.csv")


def _join_responses(texts):
    """Concatenate one host's responses into a single space-separated document."""
    return ' '.join(map(str, texts))


# --- 2. Filter out unusable responses ---
# Applied in this order:
#   a) rows whose response is NaN,
#   b) rows whose response is empty or whitespace-only,
#   c) repeated templates (identical response text from the same host_id),
#   d) responses with fewer than five whitespace-separated tokens.
df_clean = (
    df
    .dropna(subset=['response'])
    .loc[lambda frame: frame['response'].str.strip().astype(bool)]
    .drop_duplicates(subset=['host_id', 'response'])
    .loc[lambda frame: frame['response'].str.split().str.len() >= 5]
)

# --- 3. Build one document per host ---
# Result columns: host_id, doc_text.
bk_docs = (
    df_clean
    .groupby('host_id')['response']
    .apply(_join_responses)
    .reset_index(name='doc_text')
)

# --- 4. Write the per-host documents to CSV for modeling ---
bk_docs.to_csv("B_k_host_review_docs.csv", index=False, encoding='utf-8')

# --- 5. Show the first rows as a quick sanity check ---
print(bk_docs.head())


   host_id                                           doc_text
0    97647  p Enjoy the Rhine! No recommend? Are we a vict...
1    98297  Thank you for your review and your stay  Sean!...
2    98647  Thanks for your kind words Hi Faisal,<br/>Than...
3    98844  Thank you for your review and your kind words ...
4   124245  ❤️❤️❤️❤️ Thank you so much ❤️❤️❤️ ❤️❤️❤️ Thank...
