In [10]:
from google_play_scraper import reviews, Sort
import pandas as pd
import os, time, datetime as dt

# --- Scraper configuration -------------------------------------------------
APP_ID = "id.kriptomaksima.app"  # Google Play package id (Floq)
LANG = COUNTRY = "id"            # language / country used for the review query
BATCH = 200                      # number of reviews requested per page
MAX_PAGES = 200                  # guardrail: upper bound on pages per run
RAW_DIR = "data/raw"
MASTER_PATH = "data/master/floq_reviews_master.csv"

# Make sure both output directories exist before anything is written to them.
for _out_dir in (RAW_DIR, os.path.dirname(MASTER_PATH)):
    os.makedirs(_out_dir, exist_ok=True)

# 1) Load the existing master file (if any) and build the set of known
#    reviewIds that the fetch loop uses for its early-stop check.
if os.path.exists(MASTER_PATH):
    try:
        master = pd.read_csv(MASTER_PATH)
    except pd.errors.EmptyDataError:
        # A zero-byte master file (e.g. left behind by an interrupted write)
        # is treated the same as "no master yet" instead of aborting the run.
        master = pd.DataFrame()
else:
    master = pd.DataFrame()

# Only build `seen` from the column when it actually exists. Falling back to
# a plain list (as `master.get("reviewId", [])` did) would crash on `.astype`.
if "reviewId" in master.columns:
    seen = set(master["reviewId"].astype(str))
else:
    seen = set()

# 2) Incremental fetch loop: request pages newest-first and stop as soon as a
#    whole page contains nothing that is not already in `seen`.
all_new = []
continuation_token = None
pages = 0

# Upper bound on *consecutive* failed requests. Without it a persistent
# network/API error would make the loop retry forever, because MAX_PAGES only
# counts successful pages.
MAX_RETRIES = 5
failures = 0

print(f"[{dt.datetime.now()}] Start incremental fetch for {APP_ID}")
while True:
    try:
        batch, continuation_token = reviews(
            APP_ID,
            lang=LANG,
            country=COUNTRY,
            sort=Sort.NEWEST,         # important: newest reviews first
            count=BATCH,
            continuation_token=continuation_token
        )
    except Exception as e:
        failures += 1
        if failures > MAX_RETRIES:
            # Abort rather than continue with a partial result: saving only
            # the newest pages would make later runs early-stop before the
            # pages that were never fetched.
            raise RuntimeError(
                f"Fetch failed {failures} times in a row on page {pages + 1}; giving up."
            ) from e
        print("Fetch error, retry 3s:", e)
        time.sleep(3)
        continue

    failures = 0  # a successful request resets the consecutive-failure count
    pages += 1
    if not batch:
        print("Empty batch, stop.")
        break

    # Keep only the reviews whose id has not been stored in the master yet.
    fresh = [r for r in batch if str(r.get("reviewId")) not in seen]
    all_new.extend(fresh)

    print(f"Page {pages}: got {len(batch)} | new {len(fresh)} | total_new {len(all_new)}")

    # Early stop: if every review on this page is already known, we have most
    # likely moved past all of the new reviews.
    if not fresh:
        print("No new reviews in this page -> early stop.")
        break

    # NOTE(review): confirm whether reviews() ever returns None as the token;
    # if it always returns a token object, the empty-batch check above is what
    # actually ends pagination and only the MAX_PAGES part applies here.
    if continuation_token is None or pages >= MAX_PAGES:
        print("No more pages or hit MAX_PAGES.")
        break

    # Be a little polite to the server between requests.
    time.sleep(0.5)

# 3) Write the newly fetched reviews to a dated raw snapshot file
#    (optional, but handy for auditing). Nothing is written when there is no
#    new data; `df_new` is still defined (empty) for the merge step.
today = f"{dt.datetime.now():%Y%m%d}"
raw_path = os.path.join(RAW_DIR, "floq_reviews_" + today + ".csv")

if not all_new:
    df_new = pd.DataFrame()
    print("No NEW data today. RAW snapshot not created.")
else:
    df_new = pd.DataFrame(all_new)
    df_new.to_csv(raw_path, index=False)
    print(f"Saved RAW snapshot: {raw_path} ({len(df_new)} rows)")

# 4) Append the new rows to the master table, drop duplicate reviewIds
#    (the earliest occurrence wins) and write the result back to disk.
if len(df_new) == 0:
    print("MASTER unchanged (no new rows).")
else:
    master = pd.concat([master, df_new], ignore_index=True)
    if "reviewId" in master.columns:
        before = len(master)
        master = master.drop_duplicates(subset=["reviewId"], keep="first")
        after = len(master)
        print(f"Dedup master: {before} -> {after}")
    master.to_csv(MASTER_PATH, index=False)
    print(f"MASTER updated: {MASTER_PATH} (rows={len(master)})")

# 5) Show a compact preview: the last 10 rows of the master table, limited to
#    whichever of the key columns are actually present.
if len(master) > 0:
    cols = ["reviewId","score","at","userName","content"]
    available = [name for name in cols if name in master.columns]
    preview = master[available].tail(10)
    display(preview)


[2025-10-09 17:30:37.480043] Start incremental fetch for id.kriptomaksima.app
Page 1: got 200 | new 200 | total_new 200
Page 2: got 200 | new 200 | total_new 400
Page 3: got 200 | new 200 | total_new 600
Page 4: got 200 | new 200 | total_new 800
Page 5: got 200 | new 200 | total_new 1000
Page 6: got 200 | new 200 | total_new 1200
Page 7: got 200 | new 200 | total_new 1400
Page 8: got 200 | new 200 | total_new 1600
Page 9: got 200 | new 200 | total_new 1800
Page 10: got 200 | new 200 | total_new 2000
Page 11: got 200 | new 200 | total_new 2200
Page 12: got 200 | new 200 | total_new 2400
Page 13: got 200 | new 200 | total_new 2600
Page 14: got 200 | new 200 | total_new 2800
Page 15: got 200 | new 200 | total_new 3000
Page 16: got 200 | new 200 | total_new 3200
Page 17: got 200 | new 200 | total_new 3400
Page 18: got 200 | new 200 | total_new 3600
Page 19: got 200 | new 200 | total_new 3800
Page 20: got 200 | new 200 | total_new 4000
Page 21: got 200 | new 200 | total_new 4200
Page 22: go

Unnamed: 0,reviewId,score,at,userName,content
8960,782c7a35-490a-4712-ba21-1de294ed1b13,5,2025-05-27 09:39:29,NFC GOLDENS,keren
8961,11349cc3-db2f-43bb-bb15-6dc5745e3def,5,2025-05-27 09:37:59,Wayan Radea,baguss
8962,89dd505b-533a-4d7e-8d2a-3a1c1bf11a2a,5,2025-05-27 09:37:40,Had Hary,Kita coba
8963,37a99f29-3abf-47c4-92af-e2113b232909,5,2025-05-27 09:37:39,Ilyas Ku,aku orang pertama yang instal aplikasi ini sem...
8964,9cbbd8ca-b0eb-4f99-a3c3-87dafef2b41a,5,2025-05-27 09:37:31,Anonym0 7,The exchange is very good
8965,e7cb5dea-c162-4f56-ac79-a11d0af81f9b,5,2025-05-27 09:36:56,Riski Dwi,untuk sekarang bagus cuman ga tau kedepannya
8966,9c11f4e0-b6ec-4881-9602-a88bb814e47d,5,2025-05-27 09:36:47,Ahmad Maulana,"baru mau pakai sih,tapi sudah sangat saya perc..."
8967,1c69c031-03ad-4f21-8791-909890fa6e55,5,2025-05-27 09:36:21,Pandu,bagus
8968,91950410-5ce1-4b63-a5f6-1f9af7ed872a,3,2025-05-27 09:35:01,Rajana Sagala,"saya ga bisa verifikasi, padahal kode otp nya ..."
8969,46c8b351-31cf-4469-b554-a4c76a0c1dc0,5,2025-05-27 09:28:50,Mohammad Fatah RIS,sebenarnya aplikasi buatan anak negeri emang b...
