In [0]:
# Configuration and Imports
import boto3
import logging
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType
from pyspark.sql.functions import col

# Configure root logging: INFO and above, timestamped, emitted through a
# single stream handler so records show up in the notebook cell output.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=[logging.StreamHandler()],
)


In [0]:
# Helper Function Definitions

# Initialize the S3 client
def get_s3_client():
    """
    Build a boto3 client for the S3 service.

    Returns:
        The object returned by ``boto3.client('s3')``.

    Raises:
        Exception: whatever ``boto3.client`` raises; the error is logged
        and then re-raised unchanged.
    """
    try:
        client = boto3.client('s3')
    except Exception as client_error:
        logging.error(f"Failed to create S3 client: {client_error}")
        raise
    else:
        logging.info("Successfully initialized S3 client.")
        return client

# Get the latest file in a specified S3 folder
def get_latest_s3_file(s3_client, bucket_name, folder_path):
    """
    Return the key of the most recently modified object under an S3 prefix.

    Every page of the listing is examined: ``list_objects_v2`` returns a
    bounded number of keys per call, so the continuation token is followed
    until the listing is no longer truncated. Entries without a
    ``LastModified`` value and "folder" placeholder keys (keys ending in
    ``/``) are ignored, since they cannot be read as a data file.

    Args:
        s3_client: Object exposing ``list_objects_v2`` (e.g. a boto3 S3 client).
        bucket_name: Name of the bucket to list.
        folder_path: Key prefix to restrict the listing to.

    Returns:
        The key of the newest matching object, or ``None`` when nothing
        matches. When several objects share the newest timestamp, the one
        listed first wins.

    Raises:
        Exception: any error raised while listing is logged and re-raised.
    """
    try:
        request = {'Bucket': bucket_name, 'Prefix': folder_path}
        latest_file = None
        latest_modified = None

        while True:
            response = s3_client.list_objects_v2(**request)

            for obj in response.get('Contents', []):
                key = obj['Key']
                modified = obj.get('LastModified')
                # Skip entries we cannot rank and directory placeholder objects.
                if modified is None or key.endswith('/'):
                    continue
                # Strict '>' keeps the first-listed object on timestamp ties.
                if latest_modified is None or modified > latest_modified:
                    latest_file, latest_modified = key, modified

            # Follow pagination; stop when the listing is complete or the
            # service did not hand back a token to continue with.
            next_token = response.get('NextContinuationToken')
            if not response.get('IsTruncated') or not next_token:
                break
            request['ContinuationToken'] = next_token

        if latest_file is None:
            logging.warning(f"No files found in S3 bucket '{bucket_name}' with prefix '{folder_path}'.")
            return None

        logging.info(f"Latest file found: {latest_file}")
        return latest_file
    except Exception as e:
        logging.error(f"Error retrieving the latest file: {e}")
        raise

# Read data from a CSV file in S3 into a Spark DataFrame
def read_csv_to_spark(s3_path, schema):
    """
    Load a CSV file from S3 as a Spark DataFrame.

    The file is read with ``header=True`` and the supplied ``schema``.
    This function uses a ``spark`` session that it does not define itself;
    it must already exist as a global (presumably provided by the notebook
    runtime — confirm for your environment).

    Args:
        s3_path: Location of the CSV file to read.
        schema: Schema handed to the CSV reader.

    Returns:
        The DataFrame produced by ``spark.read.csv``.

    Raises:
        Exception: any error raised by the reader is logged and re-raised.
    """
    try:
        loaded_df = spark.read.csv(s3_path, header=True, schema=schema)
    except Exception as read_error:
        logging.error(f"Error reading CSV file from S3 path {s3_path}: {read_error}")
        raise
    else:
        logging.info(f"Successfully read data from: {s3_path}")
        return loaded_df

# Process the DataFrame to filter and select required columns
def process_reviews_dataframe(df):
    """
    Reduce a raw reviews DataFrame to the columns kept downstream.

    Rows whose ``content`` is null are dropped, the columns ``content``,
    ``reviewCreatedVersion``, ``score``, ``at`` and ``reviewId`` are
    selected, and then ``at`` is renamed to ``review_timestamp`` and
    ``reviewId`` to ``review_id``.

    Args:
        df: DataFrame containing at least the five columns listed above.

    Returns:
        The filtered, projected and renamed DataFrame.
    """
    has_content = col("content").isNotNull()
    kept_columns = ('content', 'reviewCreatedVersion', 'score', 'at', 'reviewId')
    renames = (("at", "review_timestamp"), ("reviewId", "review_id"))

    reviews_df = df.filter(has_content).select(*kept_columns)
    # Apply the renames one at a time, in the listed order.
    for old_name, new_name in renames:
        reviews_df = reviews_df.withColumnRenamed(old_name, new_name)

    logging.info("Filtered and selected required columns from the DataFrame.")
    return reviews_df

# Save the processed DataFrame as a Delta table
def save_as_delta_table(df, delta_table_path):
    """
    Write a DataFrame to a Delta table location.

    The write uses the ``delta`` format with save mode ``append``.

    Args:
        df: DataFrame to persist.
        delta_table_path: Target path passed to the writer's ``save``.

    Raises:
        Exception: any error raised during the write is logged and re-raised.
    """
    try:
        # Same writer chain as a one-liner, spelled out step by step.
        delta_writer = df.write.format("delta")
        delta_writer = delta_writer.mode("append")
        delta_writer.save(delta_table_path)
    except Exception as write_error:
        logging.error(f"Error saving DataFrame to Delta table: {write_error}")
        raise
    else:
        logging.info(f"Data successfully saved to Delta table at: {delta_table_path}")


In [0]:
# Configuration Setup

# S3 location of the raw review exports and the Delta table they are loaded into.
BUCKET_NAME = 'topic-prediction'
FOLDER_PATH = 'raw_data'
DELTA_TABLE_PATH = "/mnt/topic-prediction/delta/reviews"

# Explicit schema applied when reading the raw CSV, so column types do not
# depend on inference. All fields are nullable.
SCHEMA = StructType([
    StructField("reviewId", StringType(), True),
    StructField("userName", StringType(), True),
    StructField("userImage", StringType(), True),
    StructField("content", StringType(), True),
    StructField("score", IntegerType(), True),
    StructField("thumbsUpCount", IntegerType(), True),
    StructField("reviewCreatedVersion", StringType(), True),
    StructField("at", TimestampType(), True),
    StructField("replyContent", StringType(), True),
    StructField("repliedAt", TimestampType(), True),
    # Read as a string, consistent with reviewCreatedVersion: version values
    # are presumably not plain integers (TODO confirm against the raw files),
    # and an integer type would make such values fail to parse. The column is
    # not selected downstream, so the output table is unaffected.
    StructField("appVersion", StringType(), True)
])


In [0]:
# Main Workflow
#
# Locate the newest object under FOLDER_PATH, read it with SCHEMA, reduce it
# to the required columns and append the result to the Delta table.
# Any failure is logged and then re-raised so the cell (and any job running
# this notebook) is reported as failed instead of appearing to succeed.

try:
    # Initialize the S3 client
    s3_client = get_s3_client()

    # Retrieve the latest file from the specified folder
    latest_file = get_latest_s3_file(s3_client, BUCKET_NAME, FOLDER_PATH)

    # If a file is found, continue processing
    if latest_file:
        s3_path = f"s3a://{BUCKET_NAME}/{latest_file}"
        raw_df = read_csv_to_spark(s3_path, SCHEMA)
        processed_df = process_reviews_dataframe(raw_df)
        save_as_delta_table(processed_df, DELTA_TABLE_PATH)

        # Show a small sample for validation
        display(processed_df.limit(5))
    else:
        logging.warning("No files found to process.")

except Exception as e:
    logging.error(f"Data processing pipeline failed: {e}")
    # Propagate the failure; swallowing it here would hide a broken run.
    raise