# Sample Notebook

## Load Environment Variables - AWS Keys

We use `%%local` to run code inside the local kernel; everything else is sent through Livy and runs on the driver.

In [10]:
%%local
from minio import Minio
from dotenv import dotenv_values
import os

# Load the S3/MinIO credentials from the .env file one directory above the
# kernel's working directory.
# dotenv_values() yields an empty mapping when the file is not found, so verify
# the keys used by the bucket-creation cell here and fail with a clear message
# instead of a bare KeyError several cells later.
config = dotenv_values("../.env")

missing_keys = [
    key
    for key in ("AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
    if not config.get(key)
]
if missing_keys:
    raise RuntimeError(
        f"Missing {', '.join(missing_keys)} in ../.env "
        f"(working directory: {os.getcwd()})"
    )

In [11]:
# Imports
from pyspark.sql import functions as F
from pyspark.sql.functions import col, lit

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Read & Perform Basic Operations

In [12]:
# Load the iris sample from the source bucket: the first row holds the column
# names and the column types are inferred from the data.
iris_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("s3a://samples-csv-src/iris.csv")
)

# Derive an extra column holding the first character of `variety`.
# NOTE(review): "new_variaty" looks like a misspelling of "variety"; it is kept
# as-is because the saved table and the outputs below use this exact name.
iris_mod_df = iris_df.withColumn(
    "new_variaty",
    F.substring("variety", 1, 1),
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [13]:
%%pretty
# Preview the first two rows of the raw CSV data.
iris_df.show(n=2)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

slength,swidth,plength,pwidth,variety
5.1,3.5,1.4,0.2,Setosa
4.9,3.0,1.4,0.2,Setosa


In [14]:
%%pretty
# Preview the first two rows, now including the derived column.
iris_mod_df.show(n=2)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

slength,swidth,plength,pwidth,variety,new_variaty
5.1,3.5,1.4,0.2,Setosa,S
4.9,3.0,1.4,0.2,Setosa,S


## Save Dataframe

Create a new bucket, then save the modified DataFrame there as a Parquet-backed Hive table.

In [15]:
%%local
# Destination bucket for the processed data.
target_bucket = "samples-csv-pre"

# Client for the MinIO endpoint, authenticated with the keys read from .env
# (secure=False: the connection is made without TLS).
client = Minio(
    "minio:10000",
    access_key=config["AWS_ACCESS_KEY_ID"],
    secret_key=config["AWS_SECRET_ACCESS_KEY"],
    secure=False,
)

# Only create the bucket when it is not there yet, so the cell can be re-run.
bucket_missing = not client.bucket_exists(target_bucket)
if bucket_missing:
    client.make_bucket(target_bucket)

# Reports whether the bucket exists now (also True when it already existed).
print("Bucket Created: ", client.bucket_exists(target_bucket))

Bucket Created:  True


In [16]:
# Make sure the target database exists before writing into it.
spark.sql("CREATE DATABASE IF NOT EXISTS samples_pre")

# Write the modified frame as Parquet at the given S3 path and save it as the
# table samples_pre.iris_mod; "overwrite" replaces the output of earlier runs.
iris_mod_df.write.saveAsTable(
    "samples_pre.iris_mod",
    format="parquet",
    mode="overwrite",
    path="s3a://samples-csv-pre/iris_mod",
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

## Read from Hive Catalog

In [17]:
# Read the data back by table name (not by S3 path) to confirm the table
# written above can be queried through the catalog.
iris_hive_df = spark.sql(
    "SELECT * FROM samples_pre.iris_mod"
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [18]:
%%pretty
# Preview the first two rows read back from the table.
iris_hive_df.show(n=2)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

slength,swidth,plength,pwidth,variety,new_variaty
5.1,3.5,1.4,0.2,Setosa,S
4.9,3.0,1.4,0.2,Setosa,S
