In [14]:
import pandas as pd

from google_play_scraper import app, Sort, reviews

from pprint import pprint

import pymongo
from pymongo import MongoClient

import datetime as dt
from tzlocal import get_localzone

import random
import time

import os

In [15]:
# Handle to the MongoDB server on this machine.
client = MongoClient('localhost', 27017)

# Database that holds everything produced by this notebook.
app_proj_db = client['app_proj_db']

# One collection for app detail records, one for scraped reviews.
info_collection, review_collection = (
    app_proj_db[collection_name]
    for collection_name in ('info_collection', 'review_collection')
)

In [16]:
# Display names of the banking apps to scrape and, in the same order,
# their Google Play package ids. (A CSV-based loader was sketched here
# previously; the lists are hard-coded instead.)
app_name = [
    'BCA Mobile',
    'Livin by Mandiri',
    'BSI Mobile',
    'BNI Mobile',
    'BRI Mobile',
]
android_appID = [
    'com.bca',
    'id.bmri.livin',
    'com.bsm.activity2',
    'src.com.bni',
    'id.co.bri.brimo',
]

In [17]:
# Shallow copies of the name / package-id lists; later cells iterate over these.
app_names = [*app_name]
app_ids = [*android_appID]

In [None]:
# Fetch the Play Store detail record for every package id in `app_ids`.
app_info = []
for package_id in app_ids:
    details = app(package_id)
    # Drop the 'comments' entry before the record is stored. pop() with a
    # default tolerates a payload that lacks the key, where `del` would
    # raise KeyError and abort the whole loop.
    details.pop('comments', None)
    app_info.append(details)

# Preview the first record; guarded so an empty id list does not raise IndexError.
if app_info:
    pprint(app_info[0])

In [None]:
# Store all fetched app detail records in the info collection in one call.
info_collection.insert_many(documents=app_info)

In [None]:
# Read every document of the info collection back into a DataFrame.
info_df = pd.DataFrame([document for document in info_collection.find({})])

# Show the first rows as this cell's output.
info_df.head()

In [None]:
# Scrape the reviews of every app in `app_names` / `app_ids`, newest first,
# one batch at a time, and persist them to `review_collection` and to a
# per-app CSV file under OUTPUT_DIR.

BATCH_SIZE = 200               # reviews requested per call to `reviews`
MAX_BATCHES = 5000             # per-app cap on the number of requests
FLUSH_EVERY = 100              # persist the buffered reviews every N batches
OUTPUT_DIR = 'Var/PlayStore/'  # per-app CSV files are appended here
# %H:%M:%S is the explicit spelling of %T, so the rendered text is unchanged.
TIME_FMT = "%m/%d/%y - %H:%M:%S %p"


def _keep_unseen(batch, seen_ids, name, package_id):
    """Return the reviews of `batch` whose 'reviewId' is not yet in `seen_ids`.

    Each returned review dict is tagged in place with 'app_name' and
    'app_id', and its id is added to `seen_ids`, so a review that shows up
    again (in this batch or a later one) is skipped.
    """
    fresh = []
    for review in batch:
        review_id = review['reviewId']
        if review_id in seen_ids:
            continue
        seen_ids.add(review_id)
        review['app_name'] = name
        review['app_id'] = package_id
        fresh.append(review)
    return fresh


def _store_reviews(buffered, name):
    """Insert `buffered` into `review_collection` and append it to the app's CSV.

    Does nothing for an empty buffer: `insert_many` rejects an empty list,
    which previously crashed the cell whenever the last flush had already
    emptied the buffer. Returns the number of reviews stored.
    """
    if not buffered:
        return 0
    review_collection.insert_many(buffered)
    output_path = OUTPUT_DIR + name + '.csv'
    # Write the header only when the file is being created.
    pd.DataFrame(buffered).to_csv(
        output_path, mode='a', header=not os.path.exists(output_path)
    )
    return len(buffered)


# The CSV target directory may not exist on a fresh checkout.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# The loop variable is deliberately NOT called `app_name`: that name is the
# list of display names defined in an earlier cell and must not be clobbered.
for current_app, app_id in zip(app_names, app_ids):

    start = dt.datetime.now(tz=get_localzone())

    print('---'*20)
    print('---'*20)
    print(f'***** {current_app} started at {start.strftime(TIME_FMT)}')
    print()

    seen_ids = set()   # every reviewId collected so far for this app
    pending = []       # reviews fetched but not yet persisted
    token = None       # None starts from the newest review
    batch_num = 0

    for _ in range(MAX_BATCHES):
        # Fetch the next batch, continuing from the token of the previous one.
        rvws, token = reviews(
            app_id,            # found in app's url
            lang='en',
            country='us',
            sort=Sort.NEWEST,  # start with most recent
            count=BATCH_SIZE,
            continuation_token=token,
        )
        batch_num += 1

        # Stop scraping this app once a batch adds no unseen reviews.
        fresh = _keep_unseen(rvws, seen_ids, current_app, app_id)
        if not fresh:
            print(f'No reviews left to scrape. Completed {batch_num} batches.\n')
            break
        pending.extend(fresh)

        # Progress line after the first batch and at every flush point.
        if batch_num == 1 or batch_num % FLUSH_EVERY == 0:
            print(f'Batch {batch_num} completed.')

        if batch_num % FLUSH_EVERY == 0:
            stored = _store_reviews(pending, current_app)

            # print update about num reviews inserted
            store_time = dt.datetime.now(tz=get_localzone())
            print(f"""
            Successfully inserted {stored} {current_app} 
            reviews into collection at {store_time.strftime(TIME_FMT)}.\n
            """)

            # empty the buffer for the next round of batches
            pending = []

        # Random pause between requests.
        time.sleep(random.randint(1,5))

    # Reached either the batch cap or a batch without unseen reviews.
    print(f'Done scraping {current_app}.')
    print(f'Scraped a total of {len(seen_ids)} unique reviews.\n')

    # Persist whatever is still buffered (no-op when the buffer is empty).
    _store_reviews(pending, current_app)

    end = dt.datetime.now(tz=get_localzone())

    # Print ending output for app
    print(f"""
    Successfully inserted all {current_app} reviews into collection
    at {end.strftime(TIME_FMT)}.\n
    """)
    print(f'Time elapsed for {current_app}: {end-start}')
    print('---'*20)
    print('---'*20)
    print('\n')

    # Random pause before moving on to the next app.
    time.sleep(random.randint(1,5))