In [1]:
!pip install delta-spark==3.2.0 -q
import os

import pyspark
from delta import *
from pyspark.sql.functions import *

In [2]:
# Build a SparkSession with the Delta Lake settings applied.
# Both .config(...) entries are needed to enable Delta Lake's features:
# one registers the Delta SQL extension, the other sets the session catalog
# to Delta's catalog class.
builder = (
    pyspark.sql.SparkSession.builder
    .appName("DeltaTutorial")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# Pass the builder through the delta helper, then get or create the session
delta_ready_builder = configure_spark_with_delta_pip(builder)
spark = delta_ready_builder.getOrCreate()

print("Spark and Delta Lake are ready!")

Spark and Delta Lake are ready!


In [3]:
# Demonstration of Managed and Unmanaged Tables
# Part 1: a managed table. No path is given to saveAsTable, so Spark chooses
# where the data is stored (the location is printed at the end of this cell).

# Create a dataframe
data = [("Amit", 28), ("Priya", 24), ("Rohan", 25)]
df = spark.createDataFrame(data, ['name', 'age'])

# Creating table
# mode("overwrite") keeps the cell re-runnable: without it, a second run fails
# because the table "managed_people" already exists.
df.write.format("delta").mode("overwrite").saveAsTable("managed_people")

# Show table
spark.sql("Select * from managed_people").show()

# Checking its Location
# DESCRIBE DETAIL returns a single row; its 'location' column holds the
# directory where the table's files are stored.
location = spark.sql("DESCRIBE DETAIL managed_people").collect()[0]['location']
print("Managed Table Location: ", location)

+-----+---+
| name|age|
+-----+---+
|Priya| 24|
|Rohan| 25|
| Amit| 28|
+-----+---+

Managed Table Location:  file:/content/spark-warehouse/managed_people


In [None]:
# Drop the managed table. IF EXISTS makes the cell safe to re-run: a plain
# DROP TABLE raises an error when the table has already been dropped.
spark.sql("DROP TABLE IF EXISTS managed_people")

## Managed Tables

The table above is an example of a managed table.
When we drop a managed table, Spark deletes the table's underlying data files along with its metadata, so the data is gone permanently.

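This happens because Spark owns both the metadata and the data of a managed table (the data is stored inside the Spark warehouse directory, as the location printed above shows). So if we accidentally drop the table, all of the data gets deleted along with the metadata.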


## Unmanaged Tables

When a table is defined over data stored at a location we choose ourselves (for example a CSV file, or a path on DBFS in the case of Databricks), the data still remains there in the form of files even if we drop the table.

This happens because dropping the table removes only the table definition (metadata) that we query through, and not the files at the original source.

These are called unmanaged (also known as external) tables.

In [5]:
# Unmanaged Tables
# Write the sample rows to a plain CSV file in the current working directory:
# a header line first, then one "name,age" line per record.
data = [("Amit", 28), ("Priya", 24), ("Rohan", 25)]

csv_lines = ["name,age\n"]
csv_lines.extend(f"{person},{years}\n" for person, years in data)

with open('unmanaged.csv', 'w') as csv_file:
    csv_file.writelines(csv_lines)

In [7]:
# Read back the CSV created by the writer cell. That cell wrote the relative
# name 'unmanaged.csv', so resolve the same name against the current working
# directory instead of assuming the file lives under '/content'.
csv_path = os.path.abspath('unmanaged.csv')

# header=True uses the first line as column names; inferSchema=True asks Spark
# to infer the column types from the data.
df_unmanaged = spark.read.csv(csv_path, header=True, inferSchema=True)
df_unmanaged.show()

+-----+---+
| name|age|
+-----+---+
| Amit| 28|
|Priya| 24|
|Rohan| 25|
+-----+---+



In [8]:
# Inspect the Python type of the object returned by spark.read.csv
type(df_unmanaged)

In [12]:
# Creating Table
# Two things make this an unmanaged (external) table demo:
#   * the data written is df_unmanaged (the DataFrame read from the CSV file),
#     not the `df` left over from the managed-table cell;
#   * an explicit "path" option is supplied, so the table's files are stored at
#     a location we choose. Without a path, saveAsTable creates a managed table.
# An absolute path is used so the location does not depend on how a relative
# table path would be resolved.
unmanaged_table_path = os.path.abspath("unmanaged_people_delta")

# mode("overwrite") keeps the cell re-runnable if the table already exists.
(
    df_unmanaged.write.format("delta")
    .mode("overwrite")
    .option("path", unmanaged_table_path)
    .saveAsTable("unmanaged_people")
)

In [13]:
# Query every row of the table by name and print the result
unmanaged_rows = spark.sql("Select * from unmanaged_people")
unmanaged_rows.show()

+-----+---+
| name|age|
+-----+---+
|Priya| 24|
|Rohan| 25|
| Amit| 28|
+-----+---+



In [14]:
# Deleting the table
# "if exists" makes the cell safe to re-run: a plain drop raises an error when
# the table has already been dropped.
spark.sql("drop table if exists unmanaged_people")

DataFrame[]

In [9]:
## Additional and optional
# Saving as view: register the CSV-backed DataFrame as a temporary view
# (replacing any existing view of that name), then query it with SQL.
view_name = 'unmanaged_view'
df_unmanaged.createOrReplaceTempView(view_name)

view_rows = spark.sql("Select * from unmanaged_view")
view_rows.show()

+-----+---+
| name|age|
+-----+---+
| Amit| 28|
|Priya| 24|
|Rohan| 25|
+-----+---+



In [11]:
# Deleting view
# "if exists" makes the cell safe to re-run: a plain drop raises an error when
# the view has already been removed.
spark.sql("drop view if exists unmanaged_view")

DataFrame[]