# DATA READING

In [0]:
# Load the raw data from the 'bronze-staging' container.
# Parquet files embed their own schema, so no 'inferSchema' option is set:
# that option belongs to the CSV reader and has no effect on Parquet.
df = spark.read.format('parquet')\
    .load('abfss://bronze-staging@datalake.dfs.core.windows.net/rawdata')

## DATA TRANSFORMATION

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
# Add 'model_category': the part of 'Model_ID' that precedes the first '-'.
df = df.withColumn(
    'model_category',
    split(df['Model_ID'], '-').getItem(0),
)

In [0]:
# Preview the data with 'Units_Sold' cast to string.
# The cast result is only displayed, never assigned back, so `df` itself
# keeps its original 'Units_Sold' type after this cell runs.
df.withColumn(
    'Units_Sold',
    df['Units_Sold'].cast('string'),
).display()

In [0]:
# Print the schema of a temporary view in which 'Units_Sold' is cast to string.
# Nothing is assigned here, so the schema of `df` itself is not altered.
df.withColumn(
    'Units_Sold',
    df['Units_Sold'].cast('string'),
).printSchema()

root
 |-- Branch_ID: string (nullable = true)
 |-- Dealer_ID: string (nullable = true)
 |-- Model_ID: string (nullable = true)
 |-- Revenue: long (nullable = true)
 |-- Units_Sold: string (nullable = true)
 |-- Date_ID: string (nullable = true)
 |-- Day: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- Year: integer (nullable = true)
 |-- BranchName: string (nullable = true)
 |-- DealerName: string (nullable = true)
 |-- model_category: string (nullable = true)



In [0]:
# New column: revenue per unit sold.
# The division is guarded so rows with zero units yield NULL rather than
# dividing by zero, which raises an error when Spark ANSI mode is enabled.
# (`when` without `otherwise` returns NULL for non-matching rows, which is
# also what the unguarded division produces in non-ANSI mode.)
df = df.withColumn(
    'Revenue_Per_Unit',
    when(col('Units_Sold') != 0, col('Revenue') / col('Units_Sold')),
)

### AD-HOC

In [0]:
# Total units sold per year and branch, ordered by year ascending and then
# by total units sold descending.
# `sum` here is pyspark.sql.functions.sum (brought in by the star import),
# not the Python builtin.
# `.display()` renders the result; without it the cell only builds a lazy
# DataFrame and no rows are shown.
df.groupBy('Year', 'BranchName')\
    .agg(sum('Units_Sold').alias('Units_Sold'))\
    .sort(col('Year').asc(), col('Units_Sold').desc())\
    .display()

## Data Writing

In [0]:
# Persist the transformed frame as Parquet, replacing whatever already
# exists at the target path.
# NOTE(review): this target (container 'silver-datawarehouse-transform',
# account 'leawdatalake') differs from the path queried in the SQL cell
# further down — confirm which location is intended.
df.write.mode('overwrite').parquet(
    'abfss://silver-datawarehouse-transform@leawdatalake.dfs.core.windows.net/carsales'
)

## Query The Silver Data

In [0]:
%sql
-- Read every column of the Parquet data stored under the 'carsales' path.
-- NOTE(review): this path (container 'silver-datawarehouse', account 'datalake')
-- is not the path the write cell above saves to — confirm which one is intended.
SELECT *
FROM parquet.`abfss://silver-datawarehouse@datalake.dfs.core.windows.net/carsales`