In [1]:
pip install google-play-scraper


Collecting google-play-scraper
  Downloading google_play_scraper-1.2.7-py3-none-any.whl.metadata (50 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/50.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.2/50.2 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading google_play_scraper-1.2.7-py3-none-any.whl (28 kB)
Installing collected packages: google-play-scraper
Successfully installed google-play-scraper-1.2.7


In [2]:
from google_play_scraper import reviews
import pandas as pd
import time

In [3]:
import pandas as pd
import time
from google_play_scraper import reviews, Sort

def fetch_reviews(app_id, lang='vi', country='vn', total=10000, batch=200, delay=1.5):
    """Crawl reviews of one app from the Google Play Store.

    Pages through ``reviews`` (newest first), feeding the continuation token
    returned by each call into the next one, until ``total`` reviews have been
    gathered, the scraper returns an empty page, or a call raises.

    Args:
        app_id: Package id of the app to crawl.
        lang: Language code forwarded to the scraper.
        country: Country code forwarded to the scraper.
        total: Maximum number of reviews to return.
        batch: Number of reviews requested per call.
        delay: Seconds to pause between two consecutive requests.

    Returns:
        A ``pandas.DataFrame`` built from the collected review records, with at
        most ``total`` rows. It is empty when nothing could be fetched; when a
        request fails part-way, the rows gathered so far are returned.
    """
    data, token = [], None

    print(f"[+] Crawling {total} reviews from {app_id}...")

    while len(data) < total:
        remaining = total - len(data)

        # Only the network call is guarded, so the best-effort handler below
        # cannot hide a bug in the bookkeeping that follows it.
        try:
            result, token = reviews(
                app_id,
                lang=lang,
                country=country,
                count=min(batch, remaining),
                sort=Sort.NEWEST,
                continuation_token=token
            )
        except Exception as e:
            # Best effort: report the failure and keep what was already fetched.
            print(f"[x] Error during crawling: {e}")
            break

        if not result:
            print("[!] No more data available.")
            break

        # NOTE(review): the scraper appears to reuse the page size stored in
        # the continuation token on follow-up calls - confirm. Trimming here
        # keeps the result within `total` either way.
        data.extend(result[:remaining])
        print(f"[✓] Collected: {len(data)}/{total}")

        # Throttle between requests only; no pause is needed after the last one.
        if len(data) < total:
            time.sleep(delay)

    return pd.DataFrame(data)

def preprocess(df, app_name, top_n=2):
    """Clean the raw reviews and keep only the most frequently reviewed versions.

    Steps:
      1. Keep the six review columns used downstream and rename them to
         snake_case.
      2. Drop rows whose app version is missing.
      3. Keep only rows belonging to the ``top_n`` versions with the most rows.
      4. Add ``churned`` (1 when ``score <= 2``, else 0) and ``app``.

    Args:
        df: Raw review frame; must contain the columns ``userName``, ``score``,
            ``content``, ``at``, ``reviewCreatedVersion`` and ``thumbsUpCount``.
        app_name: Label written to the ``app`` column of every row.
        top_n: Number of most frequent versions to keep.

    Returns:
        A new DataFrame; the input frame is not modified. The original index
        labels of the surviving rows are preserved.

    Raises:
        KeyError: If any of the required columns is missing from ``df``.
    """
    column_map = {
        'userName': 'user_name',
        'score': 'score',
        'content': 'review_text',
        'at': 'review_date',
        'reviewCreatedVersion': 'app_version',
        'thumbsUpCount': 'thumbs_up',
    }
    reviews_df = df[list(column_map)].rename(columns=column_map)

    # Drop rows without a version.
    reviews_df = reviews_df[reviews_df['app_version'].notna()]

    # Keep the `top_n` versions that have the most reviews.
    top_versions = reviews_df['app_version'].value_counts().nlargest(top_n).index
    in_top_versions = reviews_df['app_version'].isin(top_versions)

    # `.assign` builds a new frame, so the new columns are never written onto
    # a filtered slice (the pattern behind SettingWithCopyWarning).
    return reviews_df[in_top_versions].assign(
        # Churn label: a rating of 2 stars or fewer counts as churned.
        churned=lambda frame: (frame['score'] <= 2).astype(int),
        app=app_name,
    )

def main(app_id="com.garena.game.kgvn", app_name="Lien Quan Mobile", total=10000):
    """Crawl, clean and export the reviews of one app to a CSV file.

    Args:
        app_id: Package id passed to ``fetch_reviews``.
        app_name: Display name; written to the ``app`` column by ``preprocess``
            and used to derive the output file name.
        total: Number of reviews requested from ``fetch_reviews``.

    Returns:
        None. When the crawl yields no rows, a message is printed and no file
        is written.
    """
    raw_df = fetch_reviews(app_id, total=total)

    if raw_df.empty:
        print("[!] Không thu được dữ liệu.")
        return

    clean_df = preprocess(raw_df, app_name)

    # The file name is the app name lower-cased with spaces turned into
    # underscores, written to the current working directory.
    output_file = f"{app_name.lower().replace(' ', '_')}_filtered_latest_versions.csv"
    # utf-8-sig prepends a BOM, which helps spreadsheet tools detect UTF-8.
    clean_df.to_csv(output_file, index=False, encoding='utf-8-sig')
    print(f"[✓] Saved to: {output_file} ({len(clean_df)} records)")

# Run the pipeline only when this code executes as the top-level module,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()


[+] Crawling 10000 reviews from com.garena.game.kgvn...
[✓] Collected: 200/10000
[✓] Collected: 400/10000
[✓] Collected: 600/10000
[✓] Collected: 800/10000
[✓] Collected: 1000/10000
[✓] Collected: 1200/10000
[✓] Collected: 1400/10000
[✓] Collected: 1600/10000
[✓] Collected: 1800/10000
[✓] Collected: 2000/10000
[✓] Collected: 2200/10000
[✓] Collected: 2400/10000
[✓] Collected: 2600/10000
[✓] Collected: 2800/10000
[✓] Collected: 3000/10000
[✓] Collected: 3200/10000
[✓] Collected: 3400/10000
[✓] Collected: 3600/10000
[✓] Collected: 3800/10000
[✓] Collected: 4000/10000
[✓] Collected: 4200/10000
[✓] Collected: 4400/10000
[✓] Collected: 4600/10000
[✓] Collected: 4800/10000
[✓] Collected: 5000/10000
[✓] Collected: 5200/10000
[✓] Collected: 5400/10000
[✓] Collected: 5600/10000
[✓] Collected: 5800/10000
[✓] Collected: 6000/10000
[✓] Collected: 6200/10000
[✓] Collected: 6400/10000
[✓] Collected: 6600/10000
[✓] Collected: 6800/10000
[✓] Collected: 7000/10000
[✓] Collected: 7200/10000
[✓] Collecte