#### Databricks widget types: text, dropdown, combobox, multiselect

In [0]:
# Default CSV locations for the two source files.
employee_source_path = "/FileStore/tables/EmployeeRel.csv"
department_source_path = "/FileStore/tables/DepartmentRel.csv"

# Register one text widget per source file, using the path above as its default value.
for widget_name, default_path in (
    ("employee_widget_source_path", employee_source_path),
    ("department_widget_source_path", department_source_path),
):
    dbutils.widgets.text(widget_name, default_path)

In [0]:
# Echo the current widget values so the paths in use are visible in the cell output.
# (A bare widgets.get(...) call that is not the last expression of the cell shows nothing,
# so only the print is kept.)
print(f"emp: {dbutils.widgets.get('employee_widget_source_path')} , dept:{ dbutils.widgets.get('department_widget_source_path')}")

#### Employee Data Frame

In [0]:
from pyspark.sql.functions import col, lit, current_timestamp
from pyspark.sql.types import StructType,StructField, StringType,IntegerType,DoubleType

# Explicit schema for the pipe-delimited employee file; every column is nullable.
empSchema = StructType(
    [
        StructField("EmpId", IntegerType(), True),
        StructField("Emp", StringType(), True),
        StructField("DeptId", IntegerType(), True),
        StructField("Salary", DoubleType(), True),
    ]
)

# Read the CSV from the widget-supplied path, add an ingestionDate column from
# current_timestamp(), then rename Emp -> EmpName and fix the output column order.
empDF = (
    spark.read.csv(
        path=dbutils.widgets.get("employee_widget_source_path"),
        sep="|",
        header=True,
        schema=empSchema,
    )
    .withColumn("ingestionDate", current_timestamp())
    .select("EmpId", col("Emp").alias("EmpName"), "Salary", "DeptId", "ingestionDate")
)
display(empDF)


#### Department Data Frame

In [0]:
from pyspark.sql.functions import col, lit, current_timestamp
from pyspark.sql.types import StructType,StructField, StringType,IntegerType,DoubleType

# Explicit schema for the pipe-delimited department file; every column is nullable.
deptSchema = StructType([
    StructField("DeptId", IntegerType(), True),
    StructField("Dept", StringType(), True),
])

# Read from the widget value rather than the Python default variable, so that editing
# the widget actually changes the file that is loaded (the employee file is read the same way).
deptDF = spark.read.csv(
    path=dbutils.widgets.get("department_widget_source_path"),
    sep="|",
    header=True,
    schema=deptSchema,
)

# Add an ingestionDate column from current_timestamp(), then rename Dept -> DeptName.
deptDF = deptDF.withColumn("ingestionDate", current_timestamp())
deptDF = deptDF.select("DeptId", col("Dept").alias("DeptName"), "ingestionDate")
display(deptDF)

## Lake database

In [0]:
# List the contents of the lake database directory (same listing as `%fs ls`).
display(dbutils.fs.ls("dbfs:/FileStore/lakeDB/"))



In [0]:
# List the uploaded source files (same listing as `%fs ls`).
display(dbutils.fs.ls("dbfs:/FileStore/tables/"))

path,name,size,modificationTime
dbfs:/FileStore/tables/DepartmentRel.csv,DepartmentRel.csv,62,1690344550000
dbfs:/FileStore/tables/EmpPay.csv,EmpPay.csv,97,1689932930000
dbfs:/FileStore/tables/EmployeeRel.csv,EmployeeRel.csv,118,1690304274000
dbfs:/FileStore/tables/FlightData.csv,FlightData.csv,536,1690345771000
dbfs:/FileStore/tables/SalesData.csv,SalesData.csv,617,1689689602000
dbfs:/FileStore/tables/employees.csv,employees.csv,61,1689692632000


In [0]:
# Recursively delete the lake database directory (same operation as `%fs rm -r`).
dbutils.fs.rm("dbfs:/FileStore/lakeDB", True)

In [0]:
# Default database name, plus text widgets for the database name and its storage path.
db_name = "test"
lake_widget_defaults = {
    "lake_db_name": db_name,
    "lake_db_path": "/FileStore/lakeDB",
}
for widget_name, default_value in lake_widget_defaults.items():
    dbutils.widgets.text(widget_name, default_value)


In [0]:
%sql
-- Show the database name and path widget values as a one-row result.
SELECT
  '${lake_db_name}' AS DB_Name,
  '${lake_db_path}' AS DB_Path


#### Create Schema or Database

In [0]:
%sql
-- Create the database at the widget-supplied location; does nothing if it already exists.
CREATE DATABASE IF NOT EXISTS ${lake_db_name}
LOCATION "${lake_db_path}";

In [0]:
# Print the name and storage location of every database in the catalog.
for database in spark.catalog.listDatabases():
    print(f"DB_NAME: {database.name}, DB_LOCATION : {database.locationUri}")

DB_NAME: default, DB_LOCATION : dbfs:/user/hive/warehouse
DB_NAME: test, DB_LOCATION : dbfs:/FileStore/lakeDB


#### Create Managed Table

In [0]:
# OPTION 1: write the DataFrame straight to a table, without a separate table definition.
# The write mode is "append", so rows are added if the table already exists.
managed_table_name = f"{dbutils.widgets.get('lake_db_name')}.employee_managed"
empDF.write.mode("append").saveAsTable(managed_table_name)

In [0]:
%sql
-- OPTION 2: define the table explicitly first; it is loaded afterwards from a temp view.
CREATE TABLE ${lake_db_name}.employee_managed1 (
  EmpId         INTEGER,
  EmpName       STRING,
  Salary        DOUBLE,
  DeptId        INTEGER,
  ingestionDate TIMESTAMP NOT NULL  -- the only column declared NOT NULL
);



In [0]:
# OPTION 2: expose the DataFrame as a temp view and load the pre-created table with SQL.
empDF.createOrReplaceTempView("employeeTempView")

# Resolve the database from the lake_db_name widget (which the CREATE TABLE statement for
# employee_managed1 also uses) instead of the Python default, so the insert always
# targets the table that was created.
target_table = f"{dbutils.widgets.get('lake_db_name')}.employee_managed1"

# INSERT INTO maps columns by position, so list them explicitly in the table's
# declared order rather than relying on SELECT *.
spark.sql(
    f"INSERT INTO {target_table} "
    "SELECT EmpId, EmpName, Salary, DeptId, ingestionDate FROM employeeTempView"
)

In [0]:
%sql
-- Inspect the managed table; the database comes from the widget instead of a hard-coded name.
DESCRIBE EXTENDED ${lake_db_name}.employee_managed

col_name,data_type,comment
EmpId,int,
EmpName,string,
Salary,double,
DeptId,int,
ingestionDate,timestamp,
,,
# Delta Statistics Columns,,
Column Names,"DeptId, EmpId, Salary, ingestionDate, EmpName",
Column Selection Method,first-32,
,,


#### Read Managed Table

In [0]:
# OPTION 1: load the table through spark.table, using the widget-supplied database name.
# NOTE(review): the saved output of this cell is an AnalysisException (traceback truncated);
# presumably employee_managed1 did not exist when it was run — confirm it is created first.
managed_table = f"{dbutils.widgets.get('lake_db_name')}.employee_managed1"
employee_managedDF = spark.table(managed_table)
employee_managedDF.show()

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-2700949983852878>:2[0m
[1;32m      1[0m [38;5;66;03m#OPTION 1[39;00m
[0;32m----> 2[0m employee_managedDF [38;5;241m=[39m spark[38;5;241m.[39mtable([38;5;124mf[39m[38;5;124m"[39m[38;5;132;01m{[39;00mdbutils[38;5;241m.[39mwidgets[38;5;241m.[39mget([38;5;124m'[39m[38;5;124mlake_db_name[39m[38;5;124m'[39m)[38;5;132;01m}[39;00m[38;5;124m.employee_managed1[39m[38;5;124m"[39m)
[1;32m      3[0m employee_managedDF[38;5;241m.[39mshow()

File [0;32m/databricks/spark/python/pyspark/instrumentation_utils.py:48[0m, in [0;36m_wrap_function.<locals>.wrapper[0;34m(*args, **kwargs)[0m
[1;32m     46[0m start [38;5;241m=[39m time[38;5;241m.[39mperf_counter()
[1;32m     47[0m [38;5;28;01mtry[39;00m:
[0;32m---> 48[0m     res [38;5;241m=[39m [43mfunc[49m[43m(

In [0]:
# OPTION 2: query the table with Spark SQL.
# The database name is taken from the widget rather than hard-coded, so this cell
# follows the same database as the cells that created the table.
employee_managedDF1 = spark.sql(
    f"select * from {dbutils.widgets.get('lake_db_name')}.employee_managed"
)
employee_managedDF1.show()

In [0]:
%sql
-- OPTION 3: query the table from a SQL cell, using the widget-supplied database name.
SELECT * FROM ${lake_db_name}.employee_managed


#### Create External Table

In [0]:
%sql
-- Show the department source path widget value (no stray trailing space inside the literal).
SELECT '${department_widget_source_path}'

/FileStore/tables/DepartmentRel.csv
/FileStore/tables/DepartmentRel.csv


In [0]:
%sql
-- OPTION 1: external table declared with LOCATION only.
-- No header or separator options are given, so the pipe-delimited file is exposed as a
-- single string column (_c0) and the header line appears as a data row.
-- The database name comes from the widget instead of being hard-coded.
CREATE TABLE ${lake_db_name}.departmentExt1
USING CSV
LOCATION "${department_widget_source_path}"


In [0]:
%sql
-- OPTION 2: external table declared with OPTIONS, so the header row, the '|' separator
-- and inferred column types are applied when the file is read.
-- The database name comes from the widget instead of being hard-coded.
CREATE TABLE ${lake_db_name}.departmentExt2
USING csv
OPTIONS (
  path "${department_widget_source_path}",
  header 'true',
  inferSchema 'true',
  sep '|'
)

In [0]:
%sql
-- OPTION 1 again, on a different file, relying on the default CSV delimiter.
-- The database name comes from the widget instead of being hard-coded.
CREATE TABLE ${lake_db_name}.departmentExt3
USING CSV
LOCATION "dbfs:/FileStore/tables/FlightData.csv"



In [0]:
%sql
-- Inspect the external table; the database comes from the widget instead of a hard-coded name.
DESCRIBE EXTENDED ${lake_db_name}.departmentExt1

col_name,data_type,comment
_c0,string,
,,
# Detailed Table Information,,
Catalog,spark_catalog,
Database,test,
Table,departmentext1,
Owner,root,
Created Time,Fri Jul 28 07:21:17 UTC 2023,
Last Access,UNKNOWN,
Created By,Spark 3.4.0,


#### Read External Table

In [0]:
# OPTION 1: load the external table through spark.table, using the widget-supplied database name.
external_table = f"{dbutils.widgets.get('lake_db_name')}.departmentExt1"
departmentExt1_DF = spark.table(external_table)
departmentExt1_DF.show()

+-----------+
|        _c0|
+-----------+
|DeptId|Dept|
|    1|Sales|
|  2|Finance|
|3|Marketing|
|        |HR|
|5|Reporting|
+-----------+



In [0]:
# OPTION 2: query the external table with Spark SQL.
# The database name is taken from the widget rather than hard-coded.
departmentExt2_DF = spark.sql(
    f"select * from {dbutils.widgets.get('lake_db_name')}.departmentExt2"
)
departmentExt2_DF.show()

+------+---------+
|DeptId|     Dept|
+------+---------+
|     1|    Sales|
|     2|  Finance|
|     3|Marketing|
|  null|       HR|
|     5|Reporting|
+------+---------+



In [0]:
%sql
-- OPTION 3: query the external table from a SQL cell, using the widget-supplied database name.
SELECT * FROM ${lake_db_name}.departmentExt3

#### DROP TABLE

In [0]:
%sql
-- Drop the managed table.
-- IF EXISTS keeps the cell re-runnable; the database name comes from the widget.
DROP TABLE IF EXISTS ${lake_db_name}.employee_managed

In [0]:
%sql
-- Drop the external table.
-- IF EXISTS keeps the cell re-runnable; the database name comes from the widget.
DROP TABLE IF EXISTS ${lake_db_name}.departmentExt2

#### DROP DATABASE

In [0]:
%sql
-- Show the current value of the database name widget.
SELECT '${lake_db_name}'


test
test


In [0]:
%sql
-- Drop the widget-named database if it exists; CASCADE is specified, so the drop is not
-- blocked by tables that are still present in it.
DROP DATABASE IF EXISTS ${lake_db_name} CASCADE