In [0]:
%run "./utils"

In [0]:
from pyspark.sql import SparkSession

# Reuse the active Spark session, or start one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# Root of the Unity Catalog volume; the volume itself must already exist.
volume_base_path = "/Volumes/my_catalog/my_schema/my_volume"

# Source CSV files, stored directly under the volume root.
employee_csv_path = f"{volume_base_path}/Employee-Q1.csv"
department_csv_path = f"{volume_base_path}/Department-Q1.csv"
country_csv_path = f"{volume_base_path}/Country-Q1.csv"

# Bronze-layer target directories, one per dataset, inside the same volume.
bronze_employee_path = f"{volume_base_path}/bronze/employee"
bronze_department_path = f"{volume_base_path}/bronze/department"
bronze_country_path = f"{volume_base_path}/bronze/country"

# Load each source file, treating its first line as the column header.
employee_df = spark.read.csv(employee_csv_path, header=True)
department_df = spark.read.csv(department_csv_path, header=True)
country_df = spark.read.csv(country_csv_path, header=True)

# Persist every dataset to its bronze directory as CSV with a header row,
# replacing whatever a previous run left there.
for source_df, bronze_path in (
    (employee_df, bronze_employee_path),
    (department_df, bronze_department_path),
    (country_df, bronze_country_path),
):
    source_df.write.csv(bronze_path, mode="overwrite", header=True)


In [0]:
import sys
# Show the version string of the Python interpreter running this notebook.
python_runtime_info = {"python_version": sys.version}
display(python_runtime_info)

In [0]:
# Example: round-trip a small DataFrame through a hand-written CSV file.
import csv  # stdlib; quotes/escapes values that a plain ",".join would corrupt

# Destination of the manually written copy inside the Unity Catalog volume.
manual_csv_path = "/Volumes/my_catalog/my_schema/my_volume/employee.csv"

# Read the source CSV from the volume, using its first line as the header.
df = spark.read.option("header", True).csv("/Volumes/my_catalog/my_schema/my_volume/Employee-Q1.csv")
df.show()

# Collect every row to the driver. This is only acceptable because the
# example works on a small DataFrame; a large one would exhaust driver memory.
data = df.collect()

# Column names become the CSV header line.
columns = df.columns

# Write the rows with csv.writer rather than joining strings by hand:
#   * values containing commas, quotes or newlines are quoted correctly;
#   * null (None) values are written as empty fields instead of the text "None".
# newline="" lets the csv module control line endings (kept as "\n"), and the
# encoding is pinned so the output does not depend on the driver's locale.
with open(manual_csv_path, "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f, lineterminator="\n")
    writer.writerow(columns)
    writer.writerows(data)

# Now download it manually from the UI

# Read the hand-written file back from the volume to verify it. csv.writer
# escapes an embedded quote by doubling it, so tell Spark to use '"' as the
# escape character instead of its default backslash.
df1 = (
    spark.read.option("header", True)
    .option("escape", '"')
    .csv(manual_csv_path)
)
df1.show()

In [0]:
# Bronze-layer output directories in the volume, in the order they are shown:
# employee, country, department.
bronze_dirs = (
    "/Volumes/my_catalog/my_schema/my_volume/bronze/employee/",
    "/Volumes/my_catalog/my_schema/my_volume/bronze/country/",
    "/Volumes/my_catalog/my_schema/my_volume/bronze/department/",
)

# Read each directory back (first line as header) and print a preview.
bronze_frames = []
for bronze_dir in bronze_dirs:
    bronze_frame = spark.read.csv(bronze_dir, header=True)
    bronze_frame.show()
    bronze_frames.append(bronze_frame)

# Keep the original variable names available to later cells.
df1, df2, df3 = bronze_frames