## Step 1: Define variables and load CSV file

In [0]:
# Unity Catalog location (catalog / schema / volume) used by the rest of the notebook.
catalog = "exercise"
schema = "exercise"
volume = "exercise"

# Source of the CSV and the names used for the downloaded file and the target table.
download_url = "https://health.data.ny.gov/api/views/jxy9-yhdk/rows.csv"
file_name = "rows.csv"
table_name = "health_data_ny_gov"

# Derived locations: the volume directory and the "<catalog>.<schema>" table prefix.
path_volume = f"/Volumes/{catalog}/{schema}/{volume}"
path_table = f"{catalog}.{schema}"

# Echo both derived paths so they can be checked before anything is written.
print(path_table)
print(path_volume)

In [0]:
# Copy the remote CSV into the volume under the configured file name.
csv_destination = f"{path_volume}/{file_name}"
dbutils.fs.cp(download_url, csv_destination)

## Step 2: Create a DataFrame

In [0]:
# One hand-written record; its fields follow the order of the DDL schema below.
data = [[2021, "test", "Albany", "M", 42]]
# Column names only; not passed to createDataFrame (the DDL string carries names and types).
columns = ["Year", "First_Name", "County", "Sex", "Count"]

ddl_schema = "Year int, First_Name STRING, County STRING, Sex STRING, Count int"
df1 = spark.createDataFrame(data, schema=ddl_schema)

# display() is specific to Databricks notebooks and gives a richer visualization;
# df1.show() is the Apache Spark DataFrame API alternative with basic output.
display(df1)

## Step 3: Load data into a DataFrame from CSV file

In [0]:
# Read the downloaded CSV from the volume: first row is the header, column
# types are inferred from the data, and fields are comma-separated.
csv_path = f"{path_volume}/{file_name}"
df_csv = spark.read.csv(csv_path, header=True, inferSchema=True, sep=",")
display(df_csv)

## Step 4: View and interact with your DataFrame

In [0]:
# Print the schema of the CSV-backed frame first, then the hand-built one,
# so their column names and types can be compared.
for frame in (df_csv, df1):
    frame.printSchema()

## Rename column in the DataFrame

In [0]:
# Rename the CSV column "First Name" to "First_Name", the name used in df1's schema.
df_csv = df_csv.withColumnRenamed("First Name", "First_Name")

# printSchema is a method: it must be called to print the schema. Without the
# parentheses the cell only shows the bound-method object.
df_csv.printSchema()

## Combine DataFrames

In [0]:
# Stack the hand-built row on top of the CSV rows.
# NOTE: union() pairs columns by position, not by name, so both frames must
# list their columns in the same order.
df = df1.union(df_csv)

display(df)

## Filter rows in a DataFrame

In [0]:
# Keep only rows whose Count exceeds 200.
count_over_200 = df["Count"] > 200
display(df.filter(count_over_200))

## Filter rows using the .where() method

In [0]:
# Same kind of row filter, expressed with where(): keep rows whose Count exceeds 250.
count_over_250 = df["Count"] > 250
display(df.where(count_over_250))

## Select columns from a DataFrame and order by frequency

In [0]:
from pyspark.sql.functions import desc

# Project the name and count columns, largest counts first.
names_by_count = df.select("First_Name", "Count").orderBy(desc("Count"))
display(names_by_count)

## Create a subset DataFrame

In [0]:
# Row predicates, named individually for readability.
is_2009 = df["Year"] == 2009
count_over_100 = df["Count"] > 100
is_female = df["Sex"] == "F"

# Rows matching all three predicates, reduced to three columns and sorted by
# Count in descending order.
subsetDF = (
    df.filter(is_2009 & count_over_100 & is_female)
    .select("First_Name", "County", "Count")
    .orderBy(desc("Count"))
)
display(subsetDF)

## Step 5: Save the DataFrame to a table

In [0]:
# Persist the combined frame as "<catalog>.<schema>.<table>", replacing any
# existing table of that name.
target_table = f"{path_table}.{table_name}"
df.write.mode("overwrite").saveAsTable(target_table)

## Save the DataFrame to JSON files

In [0]:
# Write the combined frame as JSON files under /exercise/json_data,
# overwriting whatever is already there.
df.write.json("/exercise/json_data", mode="overwrite")

## Read the DataFrame from a JSON file

In [0]:
# Load the JSON files back into a DataFrame and show it.
# json() already selects the JSON source, so no separate format() call is needed.
df_from_json = spark.read.json("/exercise/json_data")
display(df_from_json)

## Specify a column as a SQL query

In [0]:
# selectExpr() takes SQL expression strings: keep Count as-is and add the
# upper-cased County under the alias big_name.
county_upper = df.selectExpr("Count", "upper(County) as big_name")
display(county_upper)

## Use expr() to use SQL syntax for a column

In [0]:
from pyspark.sql.functions import expr

# expr() turns a SQL expression string into a column that can be mixed with
# ordinary column names in select().
little_name = expr("lower(County) as little_name")
display(df.select("Count", little_name))

## Run an arbitrary SQL query using the spark.sql() function

In [0]:
# Query the saved table through SQL and show the first ten rows.
# The table name is built from the notebook's own configuration values.
preview_query = f"SELECT * FROM {path_table}.{table_name} limit 10"
display(spark.sql(preview_query))