# The code below mounts an S3 bucket to Databricks, reads the JSON data it contains into Spark DataFrames, and displays each DataFrame to check its content

In [None]:
# Importing libraries
# URL processing (the `parse` submodule is imported explicitly: `import urllib` alone does not guarantee it is loaded)
import urllib
import urllib.parse
# pyspark functions
from pyspark.sql.functions import *

In [None]:
# Load the AWS keys into Databricks. Full access to S3 has already been granted and the access credentials uploaded.
# Location of the Delta table that stores the credentials
delta_table_path = "dbfs:/user/hive/warehouse/authentication_credentials"

# Load the Delta table into a Spark DataFrame
aws_keys_df = spark.read.load(delta_table_path, format="delta")

In [None]:
# Get the AWS access key and secret key from the Spark DataFrame.
# Both columns are fetched with a single first() call, so only one Spark job runs
# and only the first row is brought back to the driver.
credentials_row = aws_keys_df.select('Access key ID', 'Secret access key').first()
# first() returns None for an empty DataFrame; fail with an explicit message in that case
if credentials_row is None:
    raise ValueError("aws_keys_df is empty: no AWS credentials available to mount the bucket")
ACCESS_KEY = credentials_row['Access key ID']
SECRET_KEY = credentials_row['Secret access key']
# URL-encode the secret key; safe="" makes quote() escape "/" as well, so the key can be embedded in the source URL
ENCODED_SECRET_KEY = urllib.parse.quote(string=SECRET_KEY, safe="")

In [None]:
# Now the S3 bucket can be mounted by passing in the S3 URL and the desired mount name to dbutils.fs.mount()

# AWS S3 bucket name
AWS_S3_BUCKET = "user-0e172e8c4bc3-bucket"
# Mount name for the bucket
MOUNT_NAME = "/mnt/mount_S3_PDP_mile6"
# Source url (credentials are embedded, so never print or log this value)
SOURCE_URL = "s3n://{0}:{1}@{2}".format(ACCESS_KEY, ENCODED_SECRET_KEY, AWS_S3_BUCKET)
# Mount the drive only if the mount point is not in use yet:
# dbutils.fs.mount() fails when the directory is already mounted, which would
# break this cell whenever the notebook is re-run from the top.
if any(mount.mountPoint == MOUNT_NAME for mount in dbutils.fs.mounts()):
    print("{0} is already mounted, skipping dbutils.fs.mount()".format(MOUNT_NAME))
else:
    dbutils.fs.mount(SOURCE_URL, MOUNT_NAME)

In [None]:
# To check if the S3 bucket was mounted successfully, list the content of the mount point.
# The MOUNT_NAME variable is used rather than a hard-coded path, so the listing
# always targets the directory that was actually mounted.
display(dbutils.fs.ls(MOUNT_NAME))

In [None]:
# When the data inside the mounted S3 bucket is organised in folders, the full folder path can be listed instead
pin_partition_path = "/mnt/mount_S3_PDP_mile6/topics/0e172e8c4bc3.pin/partition=0/"
pin_partition_files = dbutils.fs.ls(pin_partition_path)
display(pin_partition_files)

In [None]:
# Read the JSON format dataset from S3 into Databricks using the code cells below.

# Disable format checks during the reading of Delta tables.
# The setting is applied through the Python session API instead of a %sql magic:
# a language magic is only honoured on the first line of a cell, and this cell
# starts with a Python comment.
spark.conf.set("spark.databricks.delta.formatCheck.enabled", "false")

In [None]:
# Source path and format for the 0e172e8c4bc3.pin topic
# The asterisk (*) is a wildcard: every file with a .json extension in the folder is read
file_location = "/mnt/mount_S3_PDP_mile6/topics/0e172e8c4bc3.pin/partition=0/*.json"
file_type = "json"
# Let Spark infer the schema
infer_schema = "true"
# Load the JSON files of the 0e172e8c4bc3.pin topic from the mounted S3 bucket
df_pin = (
    spark.read
    .options(inferSchema=infer_schema)
    .format(file_type)
    .load(file_location)
)
# Show the Spark DataFrame to check its content
display(df_pin)

In [None]:
# Source path and format for the 0e172e8c4bc3.geo topic
# The asterisk (*) is a wildcard: every file with a .json extension in the folder is read
file_location = "/mnt/mount_S3_PDP_mile6/topics/0e172e8c4bc3.geo/partition=0/*.json"
file_type = "json"
# Let Spark infer the schema
infer_schema = "true"
# Load the JSON files of the 0e172e8c4bc3.geo topic from the mounted S3 bucket
df_geo = (
    spark.read
    .options(inferSchema=infer_schema)
    .format(file_type)
    .load(file_location)
)
# Show the Spark DataFrame to check its content
display(df_geo)

In [None]:
# Source path and format for the 0e172e8c4bc3.user topic
# The asterisk (*) is a wildcard: every file with a .json extension in the folder is read
file_location = "/mnt/mount_S3_PDP_mile6/topics/0e172e8c4bc3.user/partition=0/*.json"
file_type = "json"
# Let Spark infer the schema
infer_schema = "true"
# Load the JSON files of the 0e172e8c4bc3.user topic from the mounted S3 bucket
df_user = (
    spark.read
    .options(inferSchema=infer_schema)
    .format(file_type)
    .load(file_location)
)
# Show the Spark DataFrame to check its content
display(df_user)