In [1]:
!pip install delta-spark==3.2.0 -q

In [2]:
import pyspark
from delta import *
from pyspark.sql.functions import *

# Assemble a SparkSession builder for the tutorial. The two config entries
# set the SQL extensions class and the `spark_catalog` implementation to the
# Delta Lake ones; both are needed for Delta Lake's features to be enabled.
builder = (
    pyspark.sql.SparkSession.builder
    .appName("DeltaTutorial")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
)

# Pass the builder through the delta-spark pip helper, then start a session
# (or reuse the one that is already running).
spark = configure_spark_with_delta_pip(builder).getOrCreate()

print("Spark and Delta Lake are ready!")

Spark and Delta Lake are ready!


In [3]:
# Managed table: no path is supplied, so Spark decides where the data lives.
data = [('elakkiya', 21), ('varshini', 22), ('sanju', 20)]
df = spark.createDataFrame(data, ['Name', 'Age'])

# Save in Delta format. mode("overwrite") keeps this cell re-runnable: the
# default save mode raises an error if 'managed_people' already exists.
# Note that overwrite replaces any rows already stored in the table.
df.write.format("delta").mode("overwrite").saveAsTable('managed_people')

# Show the table contents
spark.sql('select * from managed_people').show()

# The reported location is inside the Spark warehouse directory
# (managed by Spark).
location = spark.sql('DESCRIBE DETAIL managed_people').collect()[0]['location']
print(location)

+--------+---+
|    Name|Age|
+--------+---+
|elakkiya| 21|
|varshini| 22|
|   sanju| 20|
+--------+---+

file:/content/spark-warehouse/managed_people


In [None]:
# Dropping a managed table deletes both the data files and the table info.
# IF EXISTS makes the cell safe to re-run when the table is already gone.
spark.sql('DROP TABLE IF EXISTS managed_people')

In [10]:
# Unmanaged (external) table: the data is written to an explicit path that
# we choose, and the catalog only records a pointer to it.
import pandas as pd

file_path = '/content/drive/MyDrive/students.csv'
table_path = "/content/drive/MyDrive/people_data"

# Read the CSV with pandas, then convert it to a Spark DataFrame. The pandas
# frame gets its own name so `df` always refers to the Spark DataFrame.
students_pdf = pd.read_csv(file_path)
df = spark.createDataFrame(students_pdf)

# Write as a Delta table at the external path.
# - format("delta"): without it the writer falls back to
#   spark.sql.sources.default (parquet unless reconfigured), so the table
#   would not be a Delta table.
# - mode("overwrite"): the default mode raises an error if 'unmanaged_data'
#   already exists, which breaks re-running this cell.
# NOTE(review): if table_path already holds non-Delta files from an earlier
# run, the directory may need to be cleared first - confirm.
(
    df.write.format("delta")
    .mode("overwrite")
    .option("path", table_path)
    .saveAsTable('unmanaged_data')
)

spark.sql('select * from unmanaged_data').show()

# The reported location is the path supplied above, not the warehouse dir.
location = spark.sql('DESCRIBE DETAIL unmanaged_data').collect()[0]['location']
print(location)

+-----+----+-------+-------+
| Name|Math|Science|English|
+-----+----+-------+-------+
|Kiran|  65|     70|     60|
|Anita|  88|     95|     90|
| Ravi|  55|     60|     58|
| Amit|  78|     85|     74|
|Priya|  92|     89|     96|
+-----+----+-------+-------+

file:/content/drive/MyDrive/people_data


In [None]:
# Dropping an unmanaged table deletes only the metadata; the files stay in
# Drive. IF EXISTS makes the cell safe to re-run when the table is already gone.
spark.sql('DROP TABLE IF EXISTS unmanaged_data')