<div  style="text-align: center; line-height: 0; padding-top: 9px;">
  <img src="../Includes/images/bookstore_schema.png" alt="Databricks Learning" style="width: 600px">
</div>

In [0]:
%run ../Includes/Copy-Datasets

In [0]:
%python
# List the contents of the customers-json directory before querying it.
customers_json_dir = f"{dataset_bookstore}/customers-json"
files = dbutils.fs.ls(customers_json_dir)
display(files)


In [0]:
-- Query one specific JSON export file directly by its path.
SELECT *
FROM JSON.`${dataset.bookstore}/customers-json/export_001.json`

In [0]:
-- Same query, but the path uses a wildcard to cover every export_*.json file.
SELECT *
FROM JSON.`${dataset.bookstore}/customers-json/export_*.json`

In [0]:
-- Point the query at the customers-json directory instead of individual files.
SELECT *
FROM JSON.`${dataset.bookstore}/customers-json/`

In [0]:
-- Count the rows returned when querying the whole customers-json directory.
SELECT COUNT(*)
FROM JSON.`${dataset.bookstore}/customers-json/`

In [0]:
-- Return every customer column plus the file each row was read from.
-- Note: the "_metadata Column" section at the end of this notebook states that
-- input_file_name() is no longer supported on newer Databricks Runtime versions
-- and shows the _metadata.file_path alternative.
SELECT
  *,
  input_file_name() AS source_file
FROM JSON.`${dataset.bookstore}/customers-json/`

In [0]:
-- Read the same customers-json directory with the TEXT format instead of JSON.
SELECT *
FROM TEXT.`${dataset.bookstore}/customers-json/`

In [0]:
-- Read the same customers-json directory with the binaryFile format.
SELECT *
FROM binaryFile.`${dataset.bookstore}/customers-json/`

In [0]:
-- Query the books CSV directory directly; no schema or reader options are supplied here.
SELECT *
FROM CSV.`${dataset.bookstore}/books-csv/`

In [0]:
-- Define book_csv over the CSV files at LOCATION, with an explicit schema and
-- the reader options those files need (header row, ';' as field delimiter).
CREATE TABLE book_csv (
  book_id  STRING,
  title    STRING,
  author   STRING,
  category STRING,
  price    DOUBLE
)
USING CSV
OPTIONS (
  header = "true",
  delimiter = ";"
)
LOCATION "${dataset.bookstore}/books-csv/"

In [0]:
-- Check that the CSV-backed table returns properly parsed columns.
SELECT *
FROM book_csv;

In [0]:
-- Show the extended metadata recorded for book_csv.
DESCRIBE TABLE EXTENDED book_csv;

In [0]:
%python
# List the CSV files currently in the books-csv directory.
books_csv_dir = f"{dataset_bookstore}/books-csv"
files = dbutils.fs.ls(books_csv_dir)
display(files)

In [0]:
%python
# Read the current rows of book_csv and append them, as additional CSV files,
# to the same directory the table is defined over.
books_df = spark.read.table("book_csv")

csv_writer = (
    books_df.write
    .format("csv")
    .mode("append")
    .option("header", "true")
    .option("delimiter", ";")
)
csv_writer.save(f"{dataset_bookstore}/books-csv")

In [0]:
%python
# List the books-csv directory again to see the files added by the append write.
books_csv_dir = f"{dataset_bookstore}/books-csv"
files = dbutils.fs.ls(books_csv_dir)
display(files)

In [0]:
-- Row count of book_csv after new files were appended to its directory,
-- taken before the table is refreshed.
SELECT COUNT(*)
FROM book_csv;

In [0]:
-- Refresh book_csv before counting again; new CSV files were appended to its
-- directory after the table was first queried.
REFRESH TABLE book_csv;

In [0]:
-- Row count of book_csv again, this time after REFRESH TABLE.
SELECT COUNT(*)
FROM book_csv;

In [0]:
-- CTAS: create the customers table from the JSON files in customers-json.
CREATE TABLE customers AS
SELECT *
FROM JSON.`${dataset.bookstore}/customers-json/`;

-- Inspect the metadata of the table that was just created.
DESCRIBE TABLE EXTENDED customers;

In [0]:
-- Remove any books_unparsed left over from a previous run so the CTAS in the
-- next cell can create it. IF EXISTS keeps the first run, when the table has
-- not been created yet, from failing.
DROP TABLE IF EXISTS books_unparsed

In [0]:
-- CTAS straight from the CSV directory, with no schema or reader options given.
CREATE TABLE books_unparsed AS
SELECT *
FROM CSV.`${dataset.bookstore}/books-csv/`;

-- Show what the table contains when the files are loaded this way.
SELECT *
FROM books_unparsed;

In [0]:
-- Temporary view over the raw CSV files. It carries the explicit schema and the
-- reader options (header row, ';' delimiter) needed to parse them.
CREATE TEMP VIEW books_tmp_vw
(book_id STRING, title STRING, author STRING, category STRING, price DOUBLE)
USING CSV
OPTIONS (
  path = "${dataset.bookstore}/books-csv/",
  header = "true",
  delimiter = ";"
);

-- Materialise the parsed rows as the books table. The source has to be the
-- temp view defined above; selecting from books would reference the very table
-- this statement is creating.
CREATE TABLE books AS
SELECT * FROM books_tmp_vw;

In [0]:
-- Show the extended metadata recorded for the books table.
DESCRIBE TABLE EXTENDED books;

## Simplified File Querying

In [0]:
/*
The read_files function makes it easier to query CSV files (and other file
formats) directly: the reader options are passed as named arguments, so there
is no need to create a temporary view first.

The path uses ${dataset.bookstore}, the same substitution key as every other
SQL cell in this notebook (dataset_bookstore is the Python variable name).
*/
SELECT * FROM read_files(
  '${dataset.bookstore}/books-csv/export_*.csv',
  format => 'csv',
  header => 'true',
  delimiter => ';'
);

In [0]:
/*
Create the books delta table directly from the CSV files with a CTAS statement
over read_files.

CREATE OR REPLACE is used because books is also the target of the earlier CTAS
in this notebook; a plain CREATE TABLE fails once that table exists.
The path uses ${dataset.bookstore}, the substitution key used by the other SQL
cells (dataset_bookstore is the Python variable name).
*/
CREATE OR REPLACE TABLE books
AS SELECT * FROM read_files(
    '${dataset.bookstore}/books-csv/export_*.csv',
    format => 'csv',
    header => 'true',
    delimiter => ';'
);
/*
The read_files function automatically tries to infer a unified schema from all
the source files. If any value doesn't match the expected schema, it is stored
in an extra column called _rescued_data as a JSON string.
*/

## The _metadata Column

In [0]:
/*
input_file_name() is no longer supported in newer versions of the Databricks
Runtime. Use the _metadata.file_path attribute instead to retrieve the path of
the file each row came from.
*/
SELECT
  *,
  _metadata.file_path AS source_file
FROM json.`${dataset.bookstore}/customers-json`;
/*
The _metadata column exposes several details about the input files:

  _metadata.file_path              - full path to the input file
  _metadata.file_name              - name of the file, including its extension
  _metadata.file_size              - size of the file in bytes
  _metadata.file_modification_time - timestamp of the file's last modification
*/