In [0]:
from pyspark.sql.types import *
from pyspark.sql import functions as F

## Define the explicit schemas used when loading the CSV files
## (Sales and the date columns are read as strings here and cast after loading)
# Schema for the orders CSV files

# (column name, Spark type) pairs for the orders CSV, in declaration order.
# "Order Date", "Ship Date" and "Sales" are declared as strings in this schema.
_ORDER_FIELDS = [
    ("Row ID", IntegerType()),
    ("Order ID", StringType()),
    ("Order Date", StringType()),
    ("Ship Date", StringType()),
    ("Ship Mode", StringType()),
    ("Customer ID", StringType()),
    ("Customer Name", StringType()),
    ("Segment", StringType()),
    ("City", StringType()),
    ("State", StringType()),
    ("Country", StringType()),
    ("Postal Code", StringType()),
    ("Region", StringType()),
    ("Product ID", StringType()),
    ("Category", StringType()),
    ("Sub-Category", StringType()),
    ("Product Name", StringType()),
    ("Sales", StringType()),
    ("Quantity", IntegerType()),
    ("Discount", DoubleType()),
    ("Profit", DoubleType()),
    ("Shipping Cost", DoubleType()),
    ("Order Priority", StringType()),
]

# Every field is nullable.
orders_schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in _ORDER_FIELDS]
)

# Schema for the returns CSV files

# All three returns columns are nullable strings.
returns_schema = StructType(
    [
        StructField(column_name, StringType(), True)
        for column_name in ("Returned", "Order ID", "Market")
    ]
)

## loading the CSV file into notebook 
# loading the order csv file 

def _read_raw_csv(source_glob, schema):
    """Read headered CSV files with an explicit schema and tag each row with ingestion metadata.

    Args:
        source_glob: Path (glob allowed) of the CSV files to load.
        schema: StructType applied to the files instead of schema inference.

    Returns:
        A DataFrame with exact duplicate rows removed and two extra columns:
        ``ingestion_ts`` (the current timestamp passed through
        ``from_utc_timestamp`` with zone "Asia/Kolkata") and ``source_file``
        (the ``_metadata.file_path`` of the file the row was read from).
    """
    raw = (
        spark.read.format("csv")
        .option("header", True)
        .option("mode", "PERMISSIVE")
        .schema(schema)
        .load(source_glob)
    )
    # NOTE(review): source_file is resolved after de-duplication, so a row that
    # occurs in several files presumably keeps the path of only one of them.
    return (
        raw.dropDuplicates()
        .withColumn("ingestion_ts", F.from_utc_timestamp(F.current_timestamp(), "Asia/Kolkata"))
        .withColumn("source_file", F.col("_metadata.file_path"))
    )


# Orders CSV files
orders_df = _read_raw_csv(
    "dbfs:/Volumes/globalsuperstore/raw_file/raw_superstore_order_volume/*.csv",
    orders_schema,
)

# Returns CSV files
returns_df = _read_raw_csv(
    "dbfs:/Volumes/globalsuperstore/raw_file/raw_superstore_return_volume/*.csv",
    returns_schema,
)

# Casting the Columns to actual data type

def _dmy_to_date_sql(column_name):
    """Build a Spark SQL expression converting a day-first date string column to DATE.

    The value is trimmed once and then sliced by position:
      * year  = last 4 characters
      * month = first 2 of the last 7 characters
      * day   = first 2 characters when the trimmed length is 10,
                first 1 character when it is 9, otherwise NULL
    The pieces are joined as ``year-month-day`` and passed to ``try_cast``,
    so any value that does not form a valid date becomes NULL.

    NOTE(review): this assumes day-first strings with a two-digit month and a
    one-character separator (e.g. ``1-07-2012`` / ``31-07-2012``) — confirm
    against the source files.

    Args:
        column_name: Name of the string column to convert.

    Returns:
        The SQL expression text, suitable for ``F.expr``.
    """
    # Slice the trimmed value everywhere, so surrounding whitespace cannot
    # shift the positional left/right extraction.
    value = f"trim(`{column_name}`)"
    day = (
        f"case length({value}) "
        f"when 10 then left({value}, 2) "
        f"when 9 then left({value}, 1) end"
    )
    # concat() yields NULL when the day is NULL, and try_cast(NULL) stays NULL.
    return (
        f"try_cast(concat(right({value}, 4), '-', "
        f"left(right({value}, 7), 2), '-', {day}) as date)"
    )


# Cast the string columns to their actual data types.
orders_df_clean = (
    orders_df
    # Strip "$" and "," from Sales; empty strings and unparseable values become NULL.
    .withColumn(
        "Sales",
        F.expr("try_cast(nullif(regexp_replace(`Sales`, '[$,]', ''), '') as decimal(18,4))"),
    )
    .withColumn("Order Date", F.expr(_dmy_to_date_sql("Order Date")))
    .withColumn("Ship Date", F.expr(_dmy_to_date_sql("Ship Date")))
)

  
## Bronze layer target locations
# Folder paths where the Delta files are written

# Root folder of the bronze layer; each table gets its own sub-folder beneath it.
bronze_base = "dbfs:/Volumes/globalsuperstore/bronze/bronze_superstore"
orders_path, returns_path = (
    "/".join((bronze_base, table_dir))
    for table_dir in ("orders_bronze_cm", "returns_bronze_cm")
)

## Writing the Delta files in Bronze schema
# Writing order table

def _overwrite_bronze_delta(df, target_path):
    """Overwrite the Delta table stored at ``target_path`` with ``df``.

    The write replaces both data and schema (``overwriteSchema``) and requests
    name-based column mapping together with reader version 2 / writer version 5.

    NOTE(review): column mapping is presumably needed because the column names
    contain spaces (e.g. "Order ID") — confirm.

    Args:
        df: DataFrame to persist.
        target_path: Folder the Delta files are saved to.
    """
    (
        df.write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .option("delta.columnMapping.mode", "name")
        .option("delta.minReaderVersion", "2")
        .option("delta.minWriterVersion", "5")
        .save(target_path)
    )


# Orders table
_overwrite_bronze_delta(orders_df_clean, orders_path)

# Returns table
_overwrite_bronze_delta(returns_df, returns_path)


# Compact both bronze tables; the orders table is additionally Z-ordered.
orders_optimize_sql = (
    f"OPTIMIZE delta.`{orders_path.rstrip('/')}` "
    "ZORDER BY (`Order Date`, `Order ID`, `Customer ID`)"
)
returns_optimize_sql = f"OPTIMIZE delta.`{returns_path.rstrip('/')}`"

spark.sql(orders_optimize_sql)
spark.sql(returns_optimize_sql)



[0;31m---------------------------------------------------------------------------[0m
[0;31m_InactiveRpcError[0m                         Traceback (most recent call last)
File [0;32m/databricks/python/lib/python3.11/site-packages/pyspark/sql/connect/client/core.py:1859[0m, in [0;36mSparkConnectClient.config[0;34m(self, operation)[0m
[1;32m   1858[0m [38;5;28;01mwith[39;00m attempt:
[0;32m-> 1859[0m     resp [38;5;241m=[39m [38;5;28mself[39m[38;5;241m.[39m_stub[38;5;241m.[39mConfig(req, metadata[38;5;241m=[39m[38;5;28mself[39m[38;5;241m.[39mmetadata())
[1;32m   1860[0m     [38;5;28mself[39m[38;5;241m.[39m_verify_response_integrity(resp)

File [0;32m/databricks/python/lib/python3.11/site-packages/grpc/_interceptor.py:277[0m, in [0;36m_UnaryUnaryMultiCallable.__call__[0;34m(self, request, timeout, metadata, credentials, wait_for_ready, compression)[0m
[1;32m    268[0m [38;5;28;01mdef[39;00m [38;5;21m__call__[39m(
[1;32m    269[0m     [38;5;2

In [0]:
# Scratch cell: prints a fixed marker string.
marker = "Life"
print(marker)

com.databricks.backend.common.rpc.CommandCancelledException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$5(SequenceExecutionState.scala:132)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:132)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:129)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:129)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:715)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:435)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:435)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.can

In [0]:
# # Define the paths
# orders_path = "dbfs:/Volumes/globalsuperstore/bronze/bronze_superstore/orders_bronze_cm/"
# returns_path = "dbfs:/Volumes/globalsuperstore/bronze/bronze_superstore/returns_bronze_cm/"

# # Read the Delta files
# orders_df = spark.read.format("delta").load(orders_path)
# returns_df = spark.read.format("delta").load(returns_path)

# # Display the DataFrames
# orders_df_clean.limit(5).display()
# orders_df.limit(5).display()
# returns_df.limit(5).display()
# display(orders_df.limit(5))
# display(returns_df.limit(5))

In [0]:
# (Re)create the bronze view that selects everything from the orders Delta folder.
master_view_sql = (
    "CREATE OR REPLACE VIEW globalsuperstore.bronze.master_superstore AS "
    "SELECT * FROM delta.`dbfs:/Volumes/globalsuperstore/bronze/bronze_superstore/orders_bronze_cm/`"
)
spark.sql(master_view_sql)

DataFrame[]