In [2]:
from google_play_scraper import Sort
from google_play_scraper.constants.element import ElementSpecs
from google_play_scraper.constants.regex import Regex
from google_play_scraper.constants.request import Formats
from google_play_scraper.utils.request import post

import pandas as pd
from datetime import datetime
import time
import json
from time import sleep
from typing import List, Optional, Tuple
from tqdm import tqdm
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
MAX_COUNT_EACH_FETCH = 199


class _ContinuationToken:
    __slots__ = (
        "token",
        "lang",
        "country",
        "sort",
        "count",
        "filter_score_with",
        "filter_device_with",
    )

    def __init__(
        self, token, lang, country, sort, count, filter_score_with, filter_device_with
    ):
        self.token = token
        self.lang = lang
        self.country = country
        self.sort = sort
        self.count = count
        self.filter_score_with = filter_score_with
        self.filter_device_with = filter_device_with


def _fetch_review_items(
    url: str,
    app_id: str,
    sort: int,
    count: int,
    filter_score_with: Optional[int],
    filter_device_with: Optional[int],
    pagination_token: Optional[str],
):
    dom = post(
        url,
        Formats.Reviews.build_body(
            app_id,
            sort,
            count,
            "null" if filter_score_with is None else filter_score_with,
            "null" if filter_device_with is None else filter_device_with,
            pagination_token,
        ),
        {"content-type": "application/x-www-form-urlencoded"},
    )
    match = json.loads(Regex.REVIEWS.findall(dom)[0])

    return json.loads(match[0][2])[0], json.loads(match[0][2])[-2][-1]


def reviews(
    app_id: str,
    lang: str = "en",
    country: str = "us",
    sort: Sort = Sort.MOST_RELEVANT,
    count: int = 100,
    filter_score_with: int = None,
    filter_device_with: int = None,
    continuation_token: _ContinuationToken = None,
) -> Tuple[List[dict], _ContinuationToken]:
    sort = sort.value

    if continuation_token is not None:
        token = continuation_token.token

        if token is None:
            return (
                [],
                continuation_token,
            )

        lang = continuation_token.lang
        country = continuation_token.country
        sort = continuation_token.sort
        count = continuation_token.count
        filter_score_with = continuation_token.filter_score_with
        filter_device_with = continuation_token.filter_device_with
    else:
        token = None

    url = Formats.Reviews.build(lang=lang, country=country)

    _fetch_count = count

    result = []

    while True:
        if _fetch_count == 0:
            break

        if _fetch_count > MAX_COUNT_EACH_FETCH:
            _fetch_count = MAX_COUNT_EACH_FETCH

        try:
            review_items, token = _fetch_review_items(
                url,
                app_id,
                sort,
                _fetch_count,
                filter_score_with,
                filter_device_with,
                token,
            )
        except (TypeError, IndexError):
            if continuation_token and hasattr(continuation_token, 'token'):
                token = continuation_token.token
            else:
                token = None
            continue

        for review in review_items:
            result.append(
                {
                    k: spec.extract_content(review)
                    for k, spec in ElementSpecs.Review.items()
                }
            )

        _fetch_count = count - len(result)

        if isinstance(token, list):
            token = None
            break

    return (
        result,
        _ContinuationToken(
            token, lang, country, sort, count, filter_score_with, filter_device_with
        )
    )


def reviews_all(app_id: str, sleep_milliseconds: int = 0, **kwargs) -> list:
    kwargs.pop("count", None)
    kwargs.pop("continuation_token", None)

    continuation_token = None

    result = []

    while True:
        _result, continuation_token = reviews(
            app_id,
            count=MAX_COUNT_EACH_FETCH,
            continuation_token=continuation_token,
            **kwargs
        )

        result += _result

        if continuation_token.token is None:
            break

        if sleep_milliseconds:
            sleep(sleep_milliseconds / 1000)

    return result

In [4]:
# Konfigurasi
output_folder = r"D:\Code\Data Mining\Kuliah Data Mining\Praktikum10"
app_id = "com.tokopedia.tkpd"  # ID aplikasi Tokopedia di Play Store
reviews_count = 25000          # Jumlah review yang ingin diambil

In [None]:
# Main scraping code
result = []
continuation_token = None

with tqdm(total=reviews_count, position=0, leave=True) as pbar:
    while len(result) < reviews_count:
        new_result, continuation_token = reviews(
            app_id,
            continuation_token=continuation_token,
            lang='id',  # Bahasa review (Indonesia)
            country='id',  # Negara (Indonesia)
            sort=Sort.MOST_RELEVANT,
            filter_score_with=None,
            count=199
        )
        if not new_result:
            break
        result.extend(new_result)
        pbar.update(len(new_result))
        
        # membuat jeda utk menghindari rate limiting
        sleep(0.5)

25074it [02:56, 141.68it/s]                           


In [6]:
df = pd.DataFrame(result)

print("Kolom yang tersedia:", df.columns.tolist())

df = df[['reviewId', 'userName', 'content', 'score',
         'thumbsUpCount', 'reviewCreatedVersion', 'at', 'appVersion']]

csv_path = os.path.join(output_folder, 'tokopedia_reviews_full.csv')
df.to_csv(csv_path, index=False)
print(f"Data lengkap telah disimpan di: {csv_path}")

Kolom yang tersedia: ['reviewId', 'userName', 'userImage', 'content', 'score', 'thumbsUpCount', 'reviewCreatedVersion', 'at', 'replyContent', 'repliedAt', 'appVersion']
Data lengkap telah disimpan di: D:\Code\Data Mining\Kuliah Data Mining\Praktikum10\tokopedia_reviews_full.csv


In [None]:
# Visualisasi distribusi rating
plt.figure(figsize=(10, 6))
sns.histplot(df.score, bins=5, kde=False, color='green', edgecolor='black')
plt.title('Distribusi Rating untuk Aplikasi Tokopedia', fontsize=15)
plt.xlabel('Rating', fontsize=12)
plt.ylabel('Jumlah Review', fontsize=12)
plt.xticks([1, 2, 3, 4, 5])
plt.savefig(os.path.join(output_folder, 'tokopedia_ratings_distribution.png'))
plt.close()

df['at'] = pd.to_datetime(df['at'])

# Analisis perubahan rating berdasarkan waktu (bulanan)
df['year_month'] = df['at'].dt.to_period('M')
df_recent = df[df['year_month'].dt.year >= 2022]
monthly_avg = df_recent.groupby('year_month')['score'].mean()

plt.figure(figsize=(12, 6))
monthly_avg.plot(kind='line', color='#17A589', linewidth=2.5, marker='o')
plt.title('Rata-rata Rating Bulanan Aplikasi Tokopedia', fontsize=15)
plt.xlabel('Bulan', fontsize=12)
plt.ylabel('Rata-rata Rating', fontsize=12)
plt.grid(color='gray', linestyle='--', linewidth=0.5)
plt.tight_layout()
plt.savefig(os.path.join(output_folder, 'tokopedia_monthly_ratings.png'))
plt.close()

print("Proses scraping dan analisis selesai!")

Proses scraping dan analisis selesai!
