# Apple Podcast Review Scraping with the app_store_scraper

This program is a wrapper for scraping Apple Podcast reviews with the **app-store-scraper** package (thank you Eric Lim, see https://pypi.org/project/app-store-scraper/, MIT license). The reviews are then cleaned up with regular expressions and an emoji library (thank you Neel Shah, see https://pypi.org/project/emot/ or https://github.com/NeelShah18/emot, GNU General Public License).

To get started:
1. Set the right app_id, app_name and country under the **Set input** section
2. Make sure you have the emoji.py file in the same folder as this notebook
3. Run everything


## Import modules

In [None]:
!pip install app-store-scraper

In [None]:
from app_store_scraper import Podcast
from pprint import pprint
import os
import pandas as pd
import re

import emoji

In [None]:
def convert_emojis(text):
    """Return text with every key of emoji.UNICODE_EMOJI replaced by a label.

    The label is the value stored for that key, with commas removed and each
    run of whitespace turned into a single underscore.
    (Presumably the keys are emoji characters and the values their textual
    names; verify against the local emoji.py.)
    """
    for symbol, description in emoji.UNICODE_EMOJI.items():
        label = "_".join(description.replace(",", "").split())
        text = text.replace(symbol, label)
    return text

## Set input

In [None]:
# Podcast to scrape: numeric App Store id, name and storefront country
app_id = 437299706
app_name = 'last-podcast-on-the-left'
country = 'us'

# Folder that receives the exported table; it has to exist already
path_out = r'...'

# Stop right here when the output folder is missing
if not os.path.isdir(path_out):
    raise Exception(f"Output folder ({path_out}) does not exist, please create it first.")

# Name and full path of the export file, derived from the podcast name
filename_csv = f'{app_name}_reviews_table.csv'
file_csv = os.path.join(path_out, filename_csv)

In [None]:
# Requested number of reviews; deliberately far larger than any realistic
# review count so that nothing is left out
n = 100_000_000

# Scraper object for the podcast selected in the input cell
sysk = Podcast(app_id=app_id, app_name=app_name, country=country)

# Fetch the reviews (they are read from sysk.reviews further down)
sysk.review(how_many=n)

#pprint(sysk.reviews)
# Print the review count reported by the scraper
pprint(sysk.reviews_count)

## Clean review content and store list of reviews to one dictionary

A review item is a dictionary with the following content (in practice the `isEdited` key turns out to be missing from the scraped items, so it is not used below):

{
    "date": datetime.datetime,
    "isEdited": bool,
    "rating": int,
    "review": str,
    "title": str,
    "userName": str
 }

In [None]:
# One list per output column; each gets one entry per review
timestamps = []
reviews = []
reviews_raw = []
titles = []
usernames = []
ratings = []
# (the isEdited field is not collected)

# Layout of the exported timestamp strings, e.g. 20210131T235959
dt_format = '%Y%m%dT%H%M%S'

# Clean-up rules shared by review bodies and titles, as (pattern, replacement).
# The rules act on disjoint characters, so their relative order does not
# influence the result.
_CLEANUP_RULES = (
    (r'…', "..."),      # ellipsis character -> three plain periods
    (r'&', "and"),      # ampersand -> the word "and"
    (r'[‘’]', "'"),     # curly single quotes -> straight single quote
    (r'[“”]', '"'),     # curly double quotes -> straight double quote
    (r'[\n\t]', ' '),   # newlines and tabs -> space
    (r'\s+', ' '),      # runs of whitespace -> one space
    (r'\?{2,}', '?'),   # repeated question marks -> a single one
    (r'\!{2,}', '!'),   # repeated exclamation marks -> a single one
)


def _apply_cleanup_rules(text):
    """Return text after applying every rule of _CLEANUP_RULES in order."""
    for pattern, replacement in _CLEANUP_RULES:
        text = re.sub(pattern, replacement, text)
    return text


for entry in sysk.reviews:

    # Pull the fields out of the review item; text fields are stripped
    stamp = entry['date'].strftime(dt_format)
    body = entry['review'].strip()
    headline = entry['title'].strip()
    author = entry['userName'].strip()
    stars = entry['rating']

    # Keep a nearly untouched copy of the review: only tabs are replaced
    # (the export further down is tab-separated)
    reviews_raw.append(re.sub(r'\t', ' ', body))

    # Normalise punctuation and whitespace of the review and the title
    body = _apply_cleanup_rules(body)
    headline = _apply_cleanup_rules(headline)

    # Title-case the title: str.title() capitalises the first letter of
    # every word and lower-cases the rest.
    # NOTE(review): str.title() also upper-cases a letter that follows an
    # apostrophe ("don't" -> "Don'T"); confirm this is acceptable.
    headline = headline.title()

    # Spell out emojis as text in all three text fields
    body = convert_emojis(body)
    headline = convert_emojis(headline)
    author = convert_emojis(author)

    # Store the cleaned values in their columns
    timestamps.append(stamp)
    reviews.append(body)
    titles.append(headline)
    usernames.append(author)
    ratings.append(stars)

# Collect the columns in one dictionary (key order = column order)
D = {
    'user': usernames,
    'timestamp': timestamps,
    'rating': ratings,
    'title': titles,
    'review': reviews,
    'review_raw': reviews_raw,
}

## Export to CSV

In [None]:
# Turn the column dictionary into a table
df = pd.DataFrame(data=D)

# Write the table as a tab-separated file, leaving out the row index
df.to_csv(file_csv, sep="\t", index=False)
print(f'Exported to {file_csv}')

In [None]:
# Display the table that was just exported
df

## Read data from file

In [None]:
# Read the tab-separated export back in and parse the timestamp column as dates
df = pd.read_csv(file_csv, sep='\t', parse_dates=['timestamp'])

In [None]:
# Display the table as read back from the file
df

## Quick analysis

In [None]:
# Make sure the timestamp column holds datetimes, then derive the calendar
# month and year of every review from it
df['timestamp'] = pd.to_datetime(df['timestamp'], format='%Y%m%dT%H%M%S')
df['month'] = df['timestamp'].dt.month
df['year'] = df['timestamp'].dt.year

# Median per (year, month) group. The table still contains text columns
# (user, title, review, ...), so the median is restricted to numeric columns:
# since pandas 2.0 the default is numeric_only=False, which raises a
# TypeError on those columns, while older versions dropped them silently.
dfmonthly = df.groupby(["year", "month"]).median(numeric_only=True)

In [None]:
# Plot the monthly median rating; the trailing semicolon keeps the notebook
# from echoing the returned plot object
dfmonthly['rating'].plot();