In [0]:
import os
import datetime
import pandas as pd
from google_play_scraper import reviews, Sort
import boto3
import logging
from io import StringIO
import csv

# Setup logging to display messages directly in the notebook.
# force=True (Python 3.8+) replaces any handlers already attached to the root
# logger. Without it, basicConfig() is a silent no-op whenever the root logger
# already has handlers (e.g. pre-configured by the notebook runtime, or left
# over from an earlier run of this cell), so the INFO level and the format
# below would never take effect.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)


In [0]:
# Read the AWS credentials and region from the "s3_secrets" secret scope so
# that no key material is written into the notebook itself.
AWS_ACCESS_KEY_ID = dbutils.secrets.get(key="AWS_ACCESS_KEY_ID", scope="s3_secrets")
AWS_SECRET_ACCESS_KEY = dbutils.secrets.get(key="AWS_SECRET_ACCESS_KEY", scope="s3_secrets")
AWS_DEFAULT_REGION = dbutils.secrets.get(key="AWS_DEFAULT_REGION", scope="s3_secrets")

# Build the boto3 S3 client from the values fetched above.
s3_client = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    region_name=AWS_DEFAULT_REGION,
)
logging.info("Successfully created S3 client.")


In [0]:
def scrape_reviews(product_id, date_filter, count=200):
    """Fetch the newest reviews for an app and keep those from a single day.

    Args:
        product_id: App identifier passed straight to
            ``google_play_scraper.reviews``.
        date_filter: ``datetime.date``; only reviews whose ``at`` timestamp
            falls on this calendar date are kept.
        count: Maximum number of newest reviews to request. Defaults to 200,
            the value that used to be hard-coded here.

    Returns:
        pandas.DataFrame: An independent copy containing the matching reviews,
        safe for the caller to modify in place. It is empty when nothing was
        scraped or when no review matches ``date_filter``.

    Note:
        The scraper is called once and its continuation token is discarded,
        so at most ``count`` reviews are inspected. A day with more reviews
        than that may come back incomplete.
    """
    logging.info(f"Scraping reviews for product ID: {product_id}")
    result, _ = reviews(
        product_id,
        lang='en',
        country='us',
        sort=Sort.NEWEST,
        count=count,
    )

    df = pd.DataFrame(result)
    if df.empty:
        # An empty result yields a frame with no columns at all, so the
        # df['at'] lookup below would raise KeyError. Report zero reviews
        # and hand back the empty frame instead.
        logging.info(f"Scraped 0 reviews for date {date_filter}.")
        return df

    # .copy() detaches the result from `df`; callers assign to its columns,
    # which on a boolean-mask slice triggers pandas' SettingWithCopyWarning.
    filtered_df = df[df['at'].dt.date == date_filter].copy()
    logging.info(f"Scraped {len(filtered_df)} reviews for date {date_filter}.")
    return filtered_df


In [0]:
# Daily job: scrape yesterday's reviews for one app, flatten embedded line
# breaks in the text columns, and upload the result to S3 as a timestamped CSV.
PRODUCT_ID = "droom.sleepIfUCan"
BUCKET_NAME = 'topic-prediction'
# NOTE(review): "yesterday" is derived from this machine's local clock; confirm
# that it lines up with the timezone of the scraped `at` timestamps.
today = datetime.date.today()
yesterday = today - datetime.timedelta(days=1)

# Work on an explicit copy: the scraper result may be a filtered slice of a
# larger frame, and assigning columns on such a slice triggers pandas'
# SettingWithCopyWarning.
scraped_reviews_df = scrape_reviews(PRODUCT_ID, yesterday).copy()


def _flatten_newlines(value):
    """Replace newlines in a string with a space; return other values as-is."""
    if isinstance(value, str):
        # A space (rather than nothing) keeps the words on either side of the
        # line break separate, e.g. "great app\nlove it" -> "great app love it".
        return value.replace('\n', ' ')
    return value


# iterate over each text-like column in the DataFrame
for col in scraped_reviews_df.columns:
    column = scraped_reviews_df[col]
    # Cover both plain object columns and dedicated string dtypes; the
    # isinstance check in the helper leaves None/NaN and other non-string
    # cells exactly as they were.
    if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
        scraped_reviews_df[col] = column.map(_flatten_newlines)


if not scraped_reviews_df.empty:
    # create a unique file key with timestamp for S3
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    file_key = f'raw_data/raw_data_{timestamp}.csv'

    # serialize to CSV in memory, then upload the UTF-8 encoded bytes to S3
    csv_buffer = StringIO()
    scraped_reviews_df.to_csv(csv_buffer, index=False)
    s3_client.put_object(
        Bucket=BUCKET_NAME,
        Key=file_key,
        Body=csv_buffer.getvalue().encode('utf-8'),
    )
    logging.info(f"Uploaded file {file_key} to bucket {BUCKET_NAME}.")
else:
    logging.warning(f"No reviews found for the date: {yesterday}")

