## Imports

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import *

## dbutils file-system utilities

In [0]:
# List the contents of the DBFS /FileStore/tables/ directory to see which files are available to read.
dbutils.fs.ls("/FileStore/tables/")

[FileInfo(path='dbfs:/FileStore/tables/BigMart_Sales.csv', name='BigMart_Sales.csv', size=869537, modificationTime=1740823896000),
 FileInfo(path='dbfs:/FileStore/tables/drivers.json', name='drivers.json', size=180812, modificationTime=1740825727000)]

## Reading different file formats

In [0]:
# CSV: treat the first row as column names and let Spark infer the column types.
df_csv = (
    spark.read.format("csv")
    .options(header="true", inferSchema="true")
    .load("/FileStore/tables/BigMart_Sales.csv")
)

In [0]:
# JSON
# "header" and "inferSchema" are CSV reader options; the JSON source does not
# use them (it has no header row and infers the schema on its own when none is
# supplied), so they are omitted here to avoid implying they have an effect.
# multiLine=false: each line of the file is parsed as one JSON record.
df_json = (
    spark.read.format("json")
    .option("multiLine", "false")
    .load("/FileStore/tables/drivers.json")
)
    

## Defining a custom schema

In [0]:
# Print the column names and types of the CSV DataFrame (read with inferSchema enabled).
df_csv.printSchema()

root
 |-- Item_Identifier: string (nullable = true)
 |-- Item_Weight: double (nullable = true)
 |-- Item_Fat_Content: string (nullable = true)
 |-- Item_Visibility: double (nullable = true)
 |-- Item_Type: string (nullable = true)
 |-- Item_MRP: double (nullable = true)
 |-- Outlet_Identifier: string (nullable = true)
 |-- Outlet_Establishment_Year: integer (nullable = true)
 |-- Outlet_Size: string (nullable = true)
 |-- Outlet_Location_Type: string (nullable = true)
 |-- Outlet_Type: string (nullable = true)
 |-- Item_Outlet_Sales: double (nullable = true)



In [0]:
# DDL-string schema: column names and types are declared up front rather than inferred.
# Note that Item_Weight is declared as STRING in this schema.

ddl_schema = """ 
                Item_Identifier STRING,
                Item_Weight STRING,
                Item_Fat_Content STRING,
                Item_Visibility DOUBLE,
                Item_Type STRING,
                Item_MRP DOUBLE,
                Outlet_Identifier STRING,
                Outlet_Establishment_Year INTEGER,
                Outlet_Size STRING,
                Outlet_Location_Type STRING,
                Outlet_Type STRING,
                Item_Outlet_Sales DOUBLE
"""

# Read the same CSV file, applying the declared schema instead of inference.
df_test = (
    spark.read.format("csv")
    .option("header", "true")
    .schema(ddl_schema)
    .load("/FileStore/tables/BigMart_Sales.csv")
)
         

In [0]:
# Print the schema of df_test to confirm it carries the types declared in ddl_schema.
df_test.printSchema()

root
 |-- Item_Identifier: string (nullable = true)
 |-- Item_Weight: string (nullable = true)
 |-- Item_Fat_Content: string (nullable = true)
 |-- Item_Visibility: double (nullable = true)
 |-- Item_Type: string (nullable = true)
 |-- Item_MRP: double (nullable = true)
 |-- Outlet_Identifier: string (nullable = true)
 |-- Outlet_Establishment_Year: integer (nullable = true)
 |-- Outlet_Size: string (nullable = true)
 |-- Outlet_Location_Type: string (nullable = true)
 |-- Outlet_Type: string (nullable = true)
 |-- Item_Outlet_Sales: double (nullable = true)



## Basic Transformations

In [0]:
# DataFrame transformations return a new DataFrame and leave df_csv untouched,
# so each result is bound to a name; a bare expression in the middle of a cell
# is built and then discarded without being shown.
# Column names are spelled exactly as in the schema so the lookups also resolve
# when case-sensitive analysis (spark.sql.caseSensitive) is enabled.

# Select specific columns
df_selected = df_csv.select(
    col("Item_Identifier"),
    col("Item_Weight"),
    col("Item_Fat_Content"),
)

# Renaming
df_renamed = df_csv.withColumnRenamed("Item_Identifier", "Item_ID")

# Filter/ Where
df_fda15 = df_csv.filter(col("Item_Identifier") == "FDA15")

# Render the filtered rows as the cell's output (Databricks notebook display).
display(df_fda15)


Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
FDA15,9.3,Low Fat,0.016047301,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
FDA15,9.3,Low Fat,0.016054884,Dairy,250.2092,OUT045,2002,,Tier 2,Supermarket Type1,5976.2208
FDA15,9.3,Low Fat,0.01601936,Dairy,248.5092,OUT035,2004,Small,Tier 2,Supermarket Type1,6474.2392
FDA15,9.3,Low Fat,0.016087659,Dairy,249.6092,OUT018,2009,Medium,Tier 3,Supermarket Type2,5976.2208
FDA15,9.3,Low Fat,0.026818196,Dairy,248.9092,OUT010,1998,,Tier 3,Grocery Store,498.0184
FDA15,9.3,Low Fat,0.016009057,Dairy,250.6092,OUT013,1987,High,Tier 3,Supermarket Type1,6474.2392
FDA15,,Low Fat,0.015944801,Dairy,249.5092,OUT027,1985,Medium,Tier 3,Supermarket Type3,6474.2392
FDA15,9.3,LF,0.016113019,Dairy,248.8092,OUT017,2007,,Tier 2,Supermarket Type1,5976.2208
