# Mount S3 Bucket to Databricks and get data.

## Imports

In [None]:
# pyspark functions
from pyspark.sql.functions import *
# URL processing
# `import urllib` alone does not guarantee that the `urllib.parse` submodule
# is loaded, so it is imported explicitly as well.
import urllib
import urllib.parse
from time import sleep

## AWS Credentials

In [None]:
# Define the path to the Delta table that stores the AWS credentials
delta_table_path = "dbfs:/user/hive/warehouse/authentication_credentials"

# Read the Delta table to a Spark DataFrame
aws_keys_df = spark.read.format("delta").load(delta_table_path)

# Fetch both key columns from the same row with a single Spark job, instead of
# running one collect() per column. first() returns None for an empty table.
credentials_row = aws_keys_df.select('Access key ID', 'Secret access key').first()
if credentials_row is None:
    raise ValueError(f"No AWS credentials found in Delta table: {delta_table_path}")

# Get the AWS access key and secret key from the fetched row
ACCESS_KEY = credentials_row['Access key ID']
SECRET_KEY = credentials_row['Secret access key']
# URL-encode the secret key; safe="" means "/" is percent-encoded as well
ENCODED_SECRET_KEY = urllib.parse.quote(string=SECRET_KEY, safe="")
# NOTE(review): the credentials are read in plain text from a Delta table;
# consider a Databricks secret scope if one is available.

## Mount S3 Bucket

In [None]:
# AWS S3 bucket name
AWS_S3_BUCKET = "user-128a59195de3-bucket"
# Mount name for the bucket
MOUNT_NAME = "/mnt/user-128a59195de3-bucket"
# Source URL with the access key and the URL-encoded secret key embedded
# NOTE(review): "s3n" is a legacy scheme; confirm whether "s3a" should be used.
SOURCE_URL = "s3n://{0}:{1}@{2}".format(ACCESS_KEY, ENCODED_SECRET_KEY, AWS_S3_BUCKET)
# Mount the bucket only when the mount point is not already in use:
# dbutils.fs.mount raises if it is, which would break re-running this cell.
existing_mount_points = [mount.mountPoint for mount in dbutils.fs.mounts()]
if MOUNT_NAME not in existing_mount_points:
    dbutils.fs.mount(SOURCE_URL, MOUNT_NAME)

# List topics stored on the mounted S3 bucket. display() is used because a
# bare expression is only rendered when it is the last statement of a cell.
display(dbutils.fs.ls(f"{MOUNT_NAME}/topics"))
# Read data from S3 bucket into dataframes
# List of topic suffixes (each one includes the leading dot)
topics = [".pin", ".geo", ".user"]

## Read data from topics into dataframes

In [None]:
def read_topics_into_dataframe(
    topic: str,
    *,
    mount_name: str = "/mnt/user-128a59195de3-bucket",
    user_id: str = "128a59195de3",
):
    """Load the JSON files of one topic from the mounted bucket into a Spark DataFrame.

    Args:
        topic: Topic suffix including the leading dot, e.g. ".pin".
        mount_name: Mount point of the bucket; defaults to the mount used
            by this notebook.
        user_id: Prefix of the topic directory name; defaults to the user ID
            used by this notebook.

    Returns:
        The DataFrame loaded from every ``*.json`` file under ``partition=0``
        of the topic directory.
    """
    # create path to topic files
    file_path = f"{mount_name}/topics/{user_id}{topic}/partition=0/*.json"
    # load JSONs from the mounted S3 bucket; reader options are kept as they were
    reader = spark.read.format("json").option("inferSchema", "true")
    return reader.load(file_path)

# Load one dataframe per topic without building and exec()-ing source strings.
# Each dataframe is stored in a dict keyed by topic name and is also bound to
# the global name df_128a59195de3_<topic> that the exec-based version created.
topic_dataframes = {}
for item in topics:
    # drop the leading "." from the topic suffix to get the topic name
    topic_name = item[1:]
    topic_df = read_topics_into_dataframe(item)
    topic_dataframes[topic_name] = topic_df
    globals()[f"df_128a59195de3_{topic_name}"] = topic_df
    display(topic_df)
