#### Read CSV from ADLS in Databricks (using Storage Account Key)

In [0]:
# Step 1: Configure access to the ADLS Gen2 storage account.
# The account key is fetched from a Databricks secret scope at run time rather
# than being pasted into the notebook, so it never ends up in source control,
# exported notebooks, or revision history. Create the scope and secret once
# (Databricks CLI or UI) and adjust the two placeholder names below.
# SECURITY: the access key that used to be hardcoded in this cell must be
# treated as compromised — rotate it in the Azure portal.
STORAGE_ACCOUNT = "hexdatastoragegen2"
CONTAINER = "datacontainer"
SECRET_SCOPE = "adls-secrets"                  # placeholder: your secret scope name
SECRET_KEY_NAME = "hexdatastoragegen2-key"     # placeholder: secret that stores the account key

spark.conf.set(
    f"fs.azure.account.key.{STORAGE_ACCOUNT}.dfs.core.windows.net",
    dbutils.secrets.get(scope=SECRET_SCOPE, key=SECRET_KEY_NAME),
)

# Step 2: Read the CSV from ADLS.
# header=true uses the first row as column names; inferSchema=true makes Spark
# derive the column types from the data.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(f"abfss://{CONTAINER}@{STORAGE_ACCOUNT}.dfs.core.windows.net/sales/sales_data.csv")
)

# Step 3: Display the dataframe
display(df)

order_id,order_date,store,product,category,quantity,price,customer_age
1001,2024-01-03,Bangalore,Apple iPhone 13,Electronics,1,699.0,34
1002,2024-01-04,Pune,Samsung Galaxy S21,Electronics,1,599.0,28
1003,2024-01-05,New Delhi,Logitech Mouse,Accessories,2,19.99,40
1004,2024-01-06,Bangalore,Nike Shoes,Footwear,1,120.5,26
1005,2024-01-06,Pune,Levi's Jeans,Apparel,1,49.99,31
1006,2024-01-07,Chennai,Apple AirPods,Electronics,1,129.0,29
1007,2024-01-08,Bangalore,Canon DSLR,Electronics,1,899.0,45
1008,2024-01-09,Pune,Adidas T-Shirt,Apparel,3,19.0,22
1009,2024-01-10,New Delhi,Dell Laptop,Electronics,1,999.0,37
1010,2024-01-11,Chennai,HP Printer,Electronics,1,199.99,50


What will happen?

- This connects directly to the `datacontainer` container in the `hexdatastoragegen2` storage account, authenticating with the account's access key (Key 1).

- It will load sales/sales_data.csv into a PySpark DataFrame.

- `display(df)` renders the DataFrame as an interactive table in Databricks.

#### Exploratory Data Analysis (EDA)

A — Check schema and sample

In [0]:
# Print the DataFrame's schema as a tree: one line per column showing its
# name, data type, and nullability.
df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: date (nullable = true)
 |-- store: string (nullable = true)
 |-- product: string (nullable = true)
 |-- category: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- price: double (nullable = true)
 |-- customer_age: integer (nullable = true)



In [0]:
# Preview the data: print the first five rows as a plain-text table
df.show(n=5)

+--------+----------+---------+------------------+-----------+--------+-----+------------+
|order_id|order_date|    store|           product|   category|quantity|price|customer_age|
+--------+----------+---------+------------------+-----------+--------+-----+------------+
|    1001|2024-01-03|Bangalore|   Apple iPhone 13|Electronics|       1|699.0|          34|
|    1002|2024-01-04|     Pune|Samsung Galaxy S21|Electronics|       1|599.0|          28|
|    1003|2024-01-05|New Delhi|    Logitech Mouse|Accessories|       2|19.99|          40|
|    1004|2024-01-06|Bangalore|        Nike Shoes|   Footwear|       1|120.5|          26|
|    1005|2024-01-06|     Pune|      Levi's Jeans|    Apparel|       1|49.99|          31|
+--------+----------+---------+------------------+-----------+--------+-----+------------+
only showing top 5 rows


B — Basic statistics

In [0]:
# Summary statistics (count, mean, stddev, min, max) for the numeric and
# string columns; mean/stddev are empty for string columns.
summary_stats = df.describe()
display(summary_stats)

summary,order_id,store,product,category,quantity,price,customer_age
count,20.0,20,20,20,20.0,20.0,20.0
mean,1010.5,,,,1.6,245.921,33.5
stddev,5.916079783099616,,,,1.1424811411549587,316.1642249794276,8.82878068710808
min,1001.0,Bangalore,Adidas T-Shirt,Accessories,1.0,4.99,21.0
max,1020.0,Pune,Water Bottle,Home Appliances,5.0,999.0,50.0


In [0]:
# Total number of rows in the DataFrame
total_rows = df.count()
print("Total rows:", total_rows)

Total rows: 20


In [0]:
# Number of rows that remain after dropping every row containing a null
complete_rows = df.na.drop().count()
print("Rows without nulls:", complete_rows)

Rows without nulls: 20


C — Column-level insights

In [0]:
from pyspark.sql.functions import col, countDistinct

# Number of distinct values in the "product" column
distinct_products = df.select(countDistinct(col("product")))
distinct_products.display()

count(DISTINCT product)
20


In [0]:
# Sum of the "quantity" column within each category
quantity_by_category = df.groupBy("category").agg({"quantity": "sum"})
quantity_by_category.display()

category,sum(quantity)
Apparel,4
Electronics,10
Footwear,2
Home Appliances,4
Accessories,12


In [0]:
# Mean of the "price" column within each category
avg_price_by_category = df.groupBy("category").agg({"price": "avg"})
avg_price_by_category.display()

category,avg(price)
Apparel,34.495000000000005
Electronics,493.61
Footwear,105.245
Home Appliances,41.49333333333333
Accessories,17.9925


D — Simple visualization

In [0]:
# Sum of the "price" column for each store.
# NOTE(review): this adds up "price" only and does not use "quantity". If
# "price" is a per-unit price, revenue should be quantity * price — confirm.
price_by_store = df.groupBy("store").sum("price")
price_by_store.display()

store,sum(price)
Bangalore,2126.49
Chennai,426.97
Pune,1231.97
New Delhi,1132.99


Databricks visualization. Run in Databricks to view.

E — Save as Delta Table for SQL Queries

In [0]:
# Persist the DataFrame as a managed Delta table called "sales_delta" so it
# can be queried from SQL cells; "overwrite" mode replaces existing contents.
delta_writer = df.write.format("delta").mode("overwrite")
delta_writer.saveAsTable("sales_delta")

In [0]:
%sql
-- Per-store sum of the price column from the Delta table, largest total first.
-- NOTE(review): this sums price only and ignores quantity; confirm whether
-- total_sales should be SUM(quantity * price).
SELECT
  store,
  SUM(price) AS total_sales
FROM sales_delta
GROUP BY store
ORDER BY total_sales DESC;

store,total_sales
Bangalore,2126.49
Pune,1231.97
New Delhi,1132.99
Chennai,426.97
