## Imports

In [None]:
# import pyspark functions
from pyspark.sql.functions import *
# import URL processing; urllib.parse is a submodule and is not guaranteed to be
# loaded by a bare `import urllib`, so import it explicitly
import urllib
import urllib.parse

## AWS credentials

In [None]:
# List the files under /FileStore/tables to find the name of the credentials CSV
# (the cell's output is the listing returned by dbutils.fs.ls)
dbutils.fs.ls("/FileStore/tables")

In [None]:
# Configure a CSV reader: the first row holds the column names and fields are comma-separated
credentials_reader = (
    spark.read.format("csv")
    .option("header", "true")
    .option("sep", ",")
)
# Load the credentials file from the filestore into a Spark dataframe
aws_keys_df = credentials_reader.load("/FileStore/tables/authentication_credentials.csv")

In [None]:
# Look up the row for the databricks user once and read both keys from it
# (one filter and one Spark job instead of a separate filter + collect per key)
credentials_row = (
    aws_keys_df
    .where(col('User name') == 'databricks-user')
    .select('Access key ID', 'Secret access key')
    .first()
)
# first() returns None when no row matches; fail with a clear message instead of an IndexError
if credentials_row is None:
    raise ValueError("No credentials found for user 'databricks-user' in the credentials file")
# Get the AWS access key and secret key from the matching row
ACCESS_KEY = credentials_row['Access key ID']
SECRET_KEY = credentials_row['Secret access key']
# Encode the secret key; safe="" also percent-encodes "/" so the key can be embedded in a URL
ENCODED_SECRET_KEY = urllib.parse.quote(string=SECRET_KEY, safe="")

## Mount S3 bucket

In [None]:
# AWS S3 bucket name
AWS_S3_BUCKET = "user-1215be80977f-bucket"
# Mount name for the bucket (the mount point is named after the bucket)
MOUNT_NAME = f"/mnt/{AWS_S3_BUCKET}"
# Source url; it embeds the access key and encoded secret key, so do not print or display it
SOURCE_URL = "s3n://{0}:{1}@{2}".format(ACCESS_KEY, ENCODED_SECRET_KEY, AWS_S3_BUCKET)
# Mount the drive only when the mount point does not exist yet: dbutils.fs.mount raises
# if the directory is already mounted, which would make this cell fail on a re-run
if not any(mount.mountPoint == MOUNT_NAME for mount in dbutils.fs.mounts()):
    dbutils.fs.mount(SOURCE_URL, MOUNT_NAME)

In [None]:
# Folder on the mounted S3 bucket that holds one sub-folder per topic
topics_folder = "/mnt/user-1215be80977f-bucket/topics"
# Fetch the folder listing and render it as a table
topics_listing = dbutils.fs.ls(topics_folder)
display(topics_listing)

path,name,size,modificationTime
dbfs:/mnt/user-1215be80977f-bucket/topics/1215be80977f.geo/,1215be80977f.geo/,0,1692041344013
dbfs:/mnt/user-1215be80977f-bucket/topics/1215be80977f.pin/,1215be80977f.pin/,0,1692041344013
dbfs:/mnt/user-1215be80977f-bucket/topics/1215be80977f.user/,1215be80977f.user/,0,1692041344013


## Read contents of S3 bucket into dataframes

In [None]:
# Topic suffixes, each including its leading dot; a suffix is appended to the
# "1215be80977f" prefix to form the name of the topic folder to read
topics = [".pin", ".geo", ".user"]

def read_topics_into_dataframe(topic):
    """Load the JSON files of one topic from the mounted S3 bucket.

    Args:
        topic: topic suffix including the leading dot (e.g. ".pin"); it is
            appended to the "1215be80977f" prefix to build the folder name.

    Returns:
        The Spark dataframe produced by loading every ``*.json`` file in the
        topic's ``partition=0`` folder.
    """
    # glob that matches all JSON files in the topic's partition=0 folder
    json_glob = f"/mnt/user-1215be80977f-bucket/topics/1215be80977f{topic}/partition=0/*.json"
    # JSON reader with schema inference requested
    json_reader = spark.read.format("json").option("inferSchema", "true")
    return json_reader.load(json_glob)

# Load and display each topic in turn, keeping the dataframes in a dict keyed by the
# topic name without its leading dot (".pin" -> "pin")
topic_dataframes = {}
for item in topics:
    topic_df = read_topics_into_dataframe(item)
    topic_dataframes[item[1:]] = topic_df
    display(topic_df)

# Bind the dataframes to explicit df_<topic> names instead of creating them with exec(),
# so the variables are visible to readers and tooling
df_pin = topic_dataframes["pin"]
df_geo = topic_dataframes["geo"]
df_user = topic_dataframes["user"]


## Unmount S3 bucket

In [None]:
# Unmount the bucket, removing the /mnt/user-1215be80977f-bucket mount point
dbutils.fs.unmount("/mnt/user-1215be80977f-bucket")