In [1]:
import pandas as pd

# for scraping app info from App Store
from itunes_app_scraper.scraper import AppStoreScraper

# for scraping app reviews from App Store
from app_store_scraper import AppStore

# for pretty printing data structures
from pprint import pprint

# for keeping track of timing
import datetime as dt
from tzlocal import get_localzone

# for building in wait times
import random
import time

## Set up loop to go through all apps
import os

In [2]:

## Read in file containing app names and IDs
app_df = pd.read_csv('Var/AppStore/apps.csv')
app_df.head()

Unnamed: 0,app_name,iOS_revs,iOS_app_name,iOS_app_id
0,BSI Mobile,6120,bsi-mobile,1410072458
1,Livin' by Mandiri,120,livin-by-mandiri,1555414743


In [3]:
## Get list of app names and app IDs
app_names = list(app_df['iOS_app_name'])
app_ids = list(app_df['iOS_app_id'])

In [4]:
## Set up App Store Scraper
scraper = AppStoreScraper()
app_store_list = list(scraper.get_multiple_app_details(app_ids))

## Pretty print the data for the first app
pprint(app_store_list[0])

{'advisories': '',
 'appletvScreenshotUrls': '',
 'artistId': 927331651,
 'artistName': 'PT Bank Syariah Indonesia, Tbk',
 'artistViewUrl': 'https://apps.apple.com/nl/developer/pt-bank-syariah-indonesia-tbk/id927331651?uo=4',
 'artworkUrl100': 'https://is5-ssl.mzstatic.com/image/thumb/Purple112/v4/2b/ff/0d/2bff0d80-d5a0-91d0-b50d-8077e9696f52/AppIcon-0-0-1x_U007emarketing-0-0-0-5-0-0-sRGB-0-0-0-GLES2_U002c0-512MB-85-220-0-0.png/100x100bb.jpg',
 'artworkUrl512': 'https://is5-ssl.mzstatic.com/image/thumb/Purple112/v4/2b/ff/0d/2bff0d80-d5a0-91d0-b50d-8077e9696f52/AppIcon-0-0-1x_U007emarketing-0-0-0-5-0-0-sRGB-0-0-0-GLES2_U002c0-512MB-85-220-0-0.png/512x512bb.jpg',
 'artworkUrl60': 'https://is5-ssl.mzstatic.com/image/thumb/Purple112/v4/2b/ff/0d/2bff0d80-d5a0-91d0-b50d-8077e9696f52/AppIcon-0-0-1x_U007emarketing-0-0-0-5-0-0-sRGB-0-0-0-GLES2_U002c0-512MB-85-220-0-0.png/60x60bb.jpg',
 'averageUserRating': 4,
 'averageUserRatingForCurrentVersion': 4,
 'bundleId': 'co.id.BSM-Mobile.MII',
 'conte

In [5]:
## Convert list of dicts to Pandas DataFrame and write to csv
app_info_df = pd.DataFrame(app_store_list)
app_info_df.to_csv('Var/AppStore/appsDetail.csv', index=False)
app_info_df.head()

Unnamed: 0,isGameCenterEnabled,advisories,features,supportedDevices,screenshotUrls,ipadScreenshotUrls,appletvScreenshotUrls,artworkUrl60,artworkUrl512,artworkUrl100,...,fileSizeBytes,sellerUrl,trackViewUrl,version,wrapperType,artistId,artistName,genres,price,userRatingCount
0,False,,,"iPhone5s-iPhone5s,iPadAir-iPadAir,iPadAirCellu...",https://is1-ssl.mzstatic.com/image/thumb/Purpl...,,,https://is5-ssl.mzstatic.com/image/thumb/Purpl...,https://is5-ssl.mzstatic.com/image/thumb/Purpl...,https://is5-ssl.mzstatic.com/image/thumb/Purpl...,...,117625856,https://www.bankbsi.co.id,https://apps.apple.com/nl/app/bsi-mobile/id141...,6.12.0,software,927331651,"PT Bank Syariah Indonesia, Tbk","Financiën,Zakelijk",0.0,1
1,False,,,"iPhone5s-iPhone5s,iPadAir-iPadAir,iPadAirCellu...",https://is2-ssl.mzstatic.com/image/thumb/Purpl...,,,https://is2-ssl.mzstatic.com/image/thumb/Purpl...,https://is2-ssl.mzstatic.com/image/thumb/Purpl...,https://is2-ssl.mzstatic.com/image/thumb/Purpl...,...,444793856,https://bmri.id/livin,https://apps.apple.com/nl/app/livin-by-mandiri...,1.2.0,software,516447725,PT. Bank Mandiri (Persero) Tbk.,"Financiën,Zakelijk",0.0,1


In [None]:
for app_name, app_id in zip(app_names, app_ids):
    
    # Get start time
    start = dt.datetime.now(tz=get_localzone())
    fmt= "%m/%d/%y - %T %p"
    
    # Print starting output for app
    print('---'*20)
    print('---'*20)    
    print(f'***** {app_name} started at {start.strftime(fmt)}')
    print()
    
    # Instantiate AppStore for app
    app_ = AppStore(country='id', app_name=app_name, app_id=app_id)
    
    # Scrape reviews posted since February 28, 2020 and limit to 10,000 reviews
    app_.review(how_many=10000,
                after=dt.datetime(2020, 2, 28),
                sleep=random.randint(20,25))
    
    reviews = app_.reviews
    
    # Add keys to store information about which app each review is for
    for rvw in reviews:
        rvw['app_name'] = app_name
        rvw['app_id'] = app_id
    
    # Print update that scraping was completed
    print(f"""Done scraping {app_name}. 
    Scraped a total of {app_.reviews_count} reviews.\n""")
    
    # Convert list of dicts to Pandas DataFrame and write to csv
    output_path = 'Var/AppStore/' + app_name + '.csv' 
    review_df = pd.DataFrame(reviews)
    review_df.to_csv(output_path, mode='a', header=not os.path.exists(output_path))
    
    # Get end time
    end = dt.datetime.now(tz=get_localzone())
    
    # Print ending output for app
    print(f"""Successfully wrote {app_name} reviews to csv
    at {end.strftime(fmt)}.\n""")
    print(f'Time elapsed for {app_name}: {end-start}')
    print('---'*20)
    print('---'*20)
    print('\n')
    
    # Wait 5 to 10 seconds to start scraping next app
    time.sleep(random.randint(5,10))