# <img src ='https://airsblobstorage.blob.core.windows.net/airstream/Asset 275.png' width="50px"> DBFS - Create and Query Table or DataFrame using DBFS

This notebook will show you how to create and query a table or DataFrame that you uploaded to DBFS. [DBFS](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html) is a Databricks File System that allows you to store data for querying inside of Databricks. This notebook assumes that you have a file already inside of DBFS that you would like to read from.

This notebook is written in **Python** so the default cell type is Python. However, you can use different languages by using the `%LANGUAGE` syntax. Python, Scala, SQL, and R are all supported.

[link to tutorial](https://docs.microsoft.com/en-us/azure/databricks/data/databricks-file-system)

In [0]:
# List the Files in DBFS root using the %fs magic commands
%fs ls

path,name,size
dbfs:/FileStore/,FileStore/,0
dbfs:/databricks-datasets/,databricks-datasets/,0
dbfs:/databricks-results/,databricks-results/,0
dbfs:/local_disk0/,local_disk0/,0
dbfs:/mnt/,mnt/,0
dbfs:/tmp/,tmp/,0
dbfs:/user/,user/,0


In [0]:
# List the Files in DBFS root
dbutils.fs.ls("/")

In [0]:
#Using the display function to output what is in the DBFS root in a nicer format
display(
  dbutils.fs.ls("/")
)

path,name,size
dbfs:/FileStore/,FileStore/,0
dbfs:/databricks-datasets/,databricks-datasets/,0
dbfs:/databricks-results/,databricks-results/,0
dbfs:/local_disk0/,local_disk0/,0
dbfs:/mnt/,mnt/,0
dbfs:/tmp/,tmp/,0
dbfs:/user/,user/,0


In [0]:
dbutils.fs.ls("dbfs:/")

In [0]:
#using the display function and dbfs:/ root to make things look nicer
display(dbutils.fs.ls("dbfs:/"))

path,name,size
dbfs:/FileStore/,FileStore/,0
dbfs:/databricks-datasets/,databricks-datasets/,0
dbfs:/databricks-results/,databricks-results/,0
dbfs:/local_disk0/,local_disk0/,0
dbfs:/mnt/,mnt/,0
dbfs:/tmp/,tmp/,0
dbfs:/user/,user/,0


In [0]:
#Checking out what is mounted in the DBFS
display(
  dbutils.fs.ls("/mnt/")
)

path,name,size
dbfs:/mnt/datalake/,datalake/,0
dbfs:/mnt/training/,training/,0


In [0]:
#Location of hive meta store
display(dbutils.fs.ls("dbfs:/user/hive/warehouse"))

path,name,size
dbfs:/user/hive/warehouse/anpadill_microsoft_com_db.db/,anpadill_microsoft_com_db.db/,0
dbfs:/user/hive/warehouse/azurestream.db/,azurestream.db/,0
dbfs:/user/hive/warehouse/deltadb.db/,deltadb.db/,0
dbfs:/user/hive/warehouse/errors_data/,errors_data/,0
dbfs:/user/hive/warehouse/failure_data/,failure_data/,0
dbfs:/user/hive/warehouse/machines_data/,machines_data/,0
dbfs:/user/hive/warehouse/maint_data/,maint_data/,0
dbfs:/user/hive/warehouse/pageviews_by_second_example_tsv/,pageviews_by_second_example_tsv/,0
dbfs:/user/hive/warehouse/taxiservicewarehouse.db/,taxiservicewarehouse.db/,0
dbfs:/user/hive/warehouse/telemetry_data/,telemetry_data/,0


In [0]:
# This willl list out all the tables you have in your Hive Meta Store
spark.catalog.listTables()

In [0]:
display(
  dbutils.fs.ls("/databricks-datasets/airlines")
)

path,name,size
dbfs:/databricks-datasets/airlines/README.md,README.md,1089
dbfs:/databricks-datasets/airlines/_SUCCESS,_SUCCESS,0
dbfs:/databricks-datasets/airlines/part-00000,part-00000,67108879
dbfs:/databricks-datasets/airlines/part-00001,part-00001,67108862
dbfs:/databricks-datasets/airlines/part-00002,part-00002,67108930
dbfs:/databricks-datasets/airlines/part-00003,part-00003,67108804
dbfs:/databricks-datasets/airlines/part-00004,part-00004,67108908
dbfs:/databricks-datasets/airlines/part-00005,part-00005,67108890
dbfs:/databricks-datasets/airlines/part-00006,part-00006,67108825
dbfs:/databricks-datasets/airlines/part-00007,part-00007,67108880


In [0]:
# File location and type
file_location = "/databricks-datasets/adult/adult.data"
file_type = "csv"

# CSV options
infer_schema = "false"
first_row_is_header = "false"
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
df = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

display(df)

_c0,_c1,_c2,_c3,_c4,_c5,_c6,_c7,_c8,_c9,_c10,_c11,_c12,_c13,_c14
39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [0]:
# Create a view or table

temp_table_name = "AdultCensusIncome_csv"

df.createOrReplaceTempView(temp_table_name)

In [0]:
%sql

/* Query the created temp table in a SQL cell */

select * from `AdultCensusIncome_csv`

_c0,_c1,_c2,_c3,_c4,_c5,_c6,_c7,_c8,_c9,_c10,_c11,_c12,_c13,_c14
39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [0]:
# With this registered as a temp view, it will only be available to this particular notebook. If you'd like other users to be able to query this table, you can also create a table from the DataFrame.
# Once saved, this table will persist across cluster restarts as well as allow various users across different notebooks to query this data.
# To do so, choose your table name and uncomment the bottom line.

permanent_table_name = "AdultCensusIncome_csv"

# df.write.format("parquet").saveAsTable(permanent_table_name)

In [0]:
display(
  dbutils.fs.ls("/databricks-datasets/iot-stream/data-device")
)

In [0]:
f = open("/dbfs/databricks-datasets/iot-stream/data-user/userData.csv", "r")
print(f.read())

In [0]:
# File location and type
file_location = "/FileStore/tables/medals-1.csv"
file_type = "csv"

# CSV options
infer_schema = "false"
first_row_is_header = "false"
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
df = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

display(df)

_c0,_c1,_c2,_c3,_c4,_c5,_c6,_c7
Year,City,Sport,Discipline,NOC,Event,Event gender,Medal
1924,Chamonix,Skating,Figure skating,AUT,individual,M,Silver
1924,Chamonix,Skating,Figure skating,AUT,individual,W,Gold
1924,Chamonix,Skating,Figure skating,AUT,pairs,X,Gold
1924,Chamonix,Bobsleigh,Bobsleigh,BEL,four-man,M,Bronze
1924,Chamonix,Ice Hockey,Ice Hockey,CAN,ice hockey,M,Gold
1924,Chamonix,Biathlon,Biathlon,FIN,military patrol,M,Silver
1924,Chamonix,Skating,Figure skating,FIN,pairs,X,Silver
1924,Chamonix,Skating,Speed skating,FIN,10000m,M,Gold
1924,Chamonix,Skating,Speed skating,FIN,10000m,M,Silver


In [0]:
# Create a view or table

temp_table_name = "medals_csv"

df.createOrReplaceTempView(temp_table_name)

In [0]:
%sql

/* Query the created temp table in a SQL cell */

select * from `medals_csv`

_c0,_c1,_c2,_c3,_c4,_c5,_c6,_c7
Year,City,Sport,Discipline,NOC,Event,Event gender,Medal
1924,Chamonix,Skating,Figure skating,AUT,individual,M,Silver
1924,Chamonix,Skating,Figure skating,AUT,individual,W,Gold
1924,Chamonix,Skating,Figure skating,AUT,pairs,X,Gold
1924,Chamonix,Bobsleigh,Bobsleigh,BEL,four-man,M,Bronze
1924,Chamonix,Ice Hockey,Ice Hockey,CAN,ice hockey,M,Gold
1924,Chamonix,Biathlon,Biathlon,FIN,military patrol,M,Silver
1924,Chamonix,Skating,Figure skating,FIN,pairs,X,Silver
1924,Chamonix,Skating,Speed skating,FIN,10000m,M,Gold
1924,Chamonix,Skating,Speed skating,FIN,10000m,M,Silver


In [0]:
# With this registered as a temp view, it will only be available to this particular notebook. If you'd like other users to be able to query this table, you can also create a table from the DataFrame.
# Once saved, this table will persist across cluster restarts as well as allow various users across different notebooks to query this data.
# To do so, choose your table name and uncomment the bottom line.

permanent_table_name = "medals_csv"

df.write.format("parquet").saveAsTable(permanent_table_name)