In [None]:
%%pyspark
df = spark.read.load('abfss://adworks-data@dssdemomtbanklake.dfs.core.windows.net/products.csv', format='csv'
## If header exists uncomment line below
, header=True
)
display(df.limit(10))

In [None]:
%%spark
// Read products.csv as CSV with a header row and preview the first ten rows.
// Each chained line ends with a dot so it stays syntactically incomplete and
// the whole reader expression is parsed as a single statement.
val df = spark.read.
  option("header", "true").
  format("csv").
  load("abfss://adworks-data@dssdemomtbanklake.dfs.core.windows.net/products.csv")
display(df.limit(10))

In [None]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Explicit schema for the products file: column names and Spark types.
# Field order is assumed to match the column order in the CSV -- confirm
# against the file.
productSchema = StructType([
    StructField("ProductID", IntegerType()),
    StructField("ProductName", StringType()),
    StructField("Category", StringType()),
    StructField("ListPrice", FloatType()),
])

# products.csv is read with header=True elsewhere in this notebook, so its
# first line holds column names rather than data. Skip it here as well: with
# header=False that line would be parsed as a product row (null ProductID and
# ListPrice, Category == "Category") and would leak into every later filter
# and count. The column names still come from productSchema.
df = spark.read.load(
    'abfss://adworks-data@dssdemomtbanklake.dfs.core.windows.net/products.csv',
    format='csv',
    schema=productSchema,
    header=True,
)
display(df.limit(10))

In [None]:
# Keep only the product identifier and list price columns.
pricelist_df = df.select(df["ProductID"], df["ListPrice"])

In [None]:
# Names and prices of products in the two bike categories.
# Filter on Category before the select drops that column, so the predicate
# only references columns that exist in the frame being filtered.
bikes_df = (
    df.where(df["Category"].isin("Mountain Bikes", "Road Bikes"))
      .select("ProductName", "ListPrice")
)
display(bikes_df)

In [None]:
# Number of product rows in each category (result column is named "count").
counts_df = (
    df.select(["ProductID", "Category"])
      .groupBy("Category")
      .count()
)
display(counts_df)

**Create a temporary view in the Spark catalog**

In [8]:
# Expose the current products DataFrame to Spark SQL under the view name
# "products"; an existing temp view with that name is replaced.
df.createOrReplaceTempView(name="products")

StatementMeta(spool, 2, 10, Finished, Available)

**Use the Spark SQL API to query data**

In [None]:
# Bike products selected with Spark SQL from the "products" temp view.
# The statement lives in a triple-quoted string: no backslash continuations
# (which break on any trailing whitespace) and no long runs of indentation
# embedded in the SQL text.
bikes_df = spark.sql("""
    SELECT ProductID, ProductName, ListPrice
    FROM products
    WHERE Category IN ('Mountain Bikes', 'Road Bikes')
""")
display(bikes_df)

**Use SQL code in a `%%sql` cell**

In [None]:
%%sql

-- Count the ProductID values in each category of the products view,
-- listed in category order.
SELECT
    Category,
    COUNT(ProductID) AS ProductCount
FROM
    products
GROUP BY
    Category
ORDER BY
    Category

**Use a graphics package (Matplotlib) in code**

In [None]:
from matplotlib import pyplot as plt

# Aggregate in Spark (one row per category), then convert the result to a
# pandas DataFrame so Matplotlib can plot it.
data = spark.sql("""
    SELECT Category, COUNT(ProductID) AS ProductCount
    FROM products
    GROUP BY Category
    ORDER BY Category
""").toPandas()

# Create exactly one figure with one axes and draw on it explicitly.
# No plt.clf() beforehand: with no figure open it would implicitly create an
# extra, empty figure that plt.show() would also render.
fig, ax = plt.subplots(figsize=(12, 8))

# Bar plot of product counts by category
ax.bar(x=data['Category'], height=data['ProductCount'], color='orange')

# Customize the chart
ax.set_title('Product Counts by Category')
ax.set_xlabel('Category')
ax.set_ylabel('Products')
ax.grid(color='#95a5a6', linestyle='--', linewidth=2, axis='y', alpha=0.7)
# Rotate the category labels so long names do not overlap
ax.tick_params(axis='x', labelrotation=70)

# Render the figure
plt.show()