
## Overview

This notebook will show you how to create and query a table or DataFrame that you uploaded to DBFS. [DBFS](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html) is a Databricks File System that allows you to store data for querying inside of Databricks. This notebook assumes that you have a file already inside of DBFS that you would like to read from.

This notebook is written in **Python** so the default cell type is Python. However, you can use different languages by using the `%LANGUAGE` syntax. Python, Scala, SQL, and R are all supported.

In [0]:
# File location and type
file_location = "/FileStore/tables/kgds-2.csv"
file_type = "csv"

# CSV options
infer_schema = "false"
first_row_is_header = "false"
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
df = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

display(df)

In [0]:
# Create a view or table

temp_table_name = "kgds-2_csv"

df.createOrReplaceTempView(temp_table_name)

In [0]:
%sql

/* Query the created temp table in a SQL cell */

select * from `kgds-2_csv`

In [0]:
# With this registered as a temp view, it will only be available to this particular notebook. If you'd like other users to be able to query this table, you can also create a table from the DataFrame.
# Once saved, this table will persist across cluster restarts as well as allow various users across different notebooks to query this data.
# To do so, choose your table name and uncomment the bottom line.

permanent_table_name = "kgds-2_csv"

# df.write.format("parquet").saveAsTable(permanent_table_name)

In [0]:
# /FileStore/tables/kgds-2.csv

dbutils.fs.ls('/FileStore/tables/')

Out[3]: [FileInfo(path='dbfs:/FileStore/tables/kgds-1.csv', name='kgds-1.csv', size=5359, modificationTime=1732156878000),
 FileInfo(path='dbfs:/FileStore/tables/kgds-2.csv', name='kgds-2.csv', size=5359, modificationTime=1732156945000),
 FileInfo(path='dbfs:/FileStore/tables/kgds.csv', name='kgds.csv', size=5359, modificationTime=1732156828000)]

In [0]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Tutorial").getOrCreate()

In [0]:
# read the csv file

df = spark.read.format("csv").option("inferSchema", True).option("header",True).load('/FileStore/tables/kgds.csv')

df.display()
df.printSchema()


Rank,Player,Nation,Confederation,Goals,Caps,Goal_per_match,Career_span,Date_of_50th_goal
1,Cristiano Ronaldo,�Portugal,UEFA,135,217,0.62,2003-,26-Jun-14
2,Lionel Messi,�Argentina,CONMEBOL,112,190,0.59,2005-,29-Mar-16
3,Ali Daei,�Iran,AFC,108,148,0.73,1993-2006,9-Jan-00
4,Sunil Chhetri,�India,AFC,94,151,0.62,2005-2024,31-Dec-15
5,Mokhtar Dahari,�Malaysia,AFC,89,142,0.63,1972-1985,22-Aug-76
6,Ali Mabkhout,�United Arab Emirates,AFC,85,115,0.74,2009-,31-Aug-19
6,Romelu Lukaku,�Belgium,UEFA,85,120,0.71,2010-,10-Oct-19
8,Ferenc Pusk�s,�Hungary_Spain,UEFA,84,89,0.94,1945-1962,24-Jul-52
8,Robert Lewandowski,�Poland,UEFA,84,156,0.54,2008-,5-Oct-17
10,Godfrey Chitalu,�Zambia,CAF,79,111,0.71,1968-1980,7-Nov-78


root
 |-- Rank: integer (nullable = true)
 |-- Player: string (nullable = true)
 |-- Nation: string (nullable = true)
 |-- Confederation: string (nullable = true)
 |-- Goals: integer (nullable = true)
 |-- Caps: integer (nullable = true)
 |-- Goal_per_match: double (nullable = true)
 |-- Career_span: string (nullable = true)
 |-- Date_of_50th_goal: string (nullable = true)



In [0]:
my_ddl_schema = ''' rank Integer, player String, nation String, Confederation String, golas String,caps String, goals_per_match Integer, Career_span String, Date_of_50th_goal String '''

In [0]:

df = spark.read.format("csv").schema(my_ddl_schema).option("header",True).load('/FileStore/tables/kgds.csv')

df.display()
df.printSchema()


rank,player,nation,Confederation,golas,caps,goals_per_match,Career_span,Date_of_50th_goal
1,Cristiano Ronaldo,�Portugal,UEFA,135,217,,2003-,26-Jun-14
2,Lionel Messi,�Argentina,CONMEBOL,112,190,,2005-,29-Mar-16
3,Ali Daei,�Iran,AFC,108,148,,1993-2006,9-Jan-00
4,Sunil Chhetri,�India,AFC,94,151,,2005-2024,31-Dec-15
5,Mokhtar Dahari,�Malaysia,AFC,89,142,,1972-1985,22-Aug-76
6,Ali Mabkhout,�United Arab Emirates,AFC,85,115,,2009-,31-Aug-19
6,Romelu Lukaku,�Belgium,UEFA,85,120,,2010-,10-Oct-19
8,Ferenc Pusk�s,�Hungary_Spain,UEFA,84,89,,1945-1962,24-Jul-52
8,Robert Lewandowski,�Poland,UEFA,84,156,,2008-,5-Oct-17
10,Godfrey Chitalu,�Zambia,CAF,79,111,,1968-1980,7-Nov-78


root
 |-- rank: integer (nullable = true)
 |-- player: string (nullable = true)
 |-- nation: string (nullable = true)
 |-- Confederation: string (nullable = true)
 |-- golas: string (nullable = true)
 |-- caps: string (nullable = true)
 |-- goals_per_match: integer (nullable = true)
 |-- Career_span: string (nullable = true)
 |-- Date_of_50th_goal: string (nullable = true)



In [0]:
df = spark.read.format("csv").schema(my_ddl_schema).option("header",True).options(mode="PERMISSIVE").load('/FileStore/tables/kgds.csv')
df.display()

df_res = df.filter(df['Confederation'] =="AFC")
df_res2 = df.filter("golas=85")

df_res.display()
df_res2.display()

rank,player,nation,Confederation,golas,caps,goals_per_match,Career_span,Date_of_50th_goal
1,Cristiano Ronaldo,�Portugal,UEFA,135,217,,2003-,26-Jun-14
2,Lionel Messi,�Argentina,CONMEBOL,112,190,,2005-,29-Mar-16
3,Ali Daei,�Iran,AFC,108,148,,1993-2006,9-Jan-00
4,Sunil Chhetri,�India,AFC,94,151,,2005-2024,31-Dec-15
5,Mokhtar Dahari,�Malaysia,AFC,89,142,,1972-1985,22-Aug-76
6,Ali Mabkhout,�United Arab Emirates,AFC,85,115,,2009-,31-Aug-19
6,Romelu Lukaku,�Belgium,UEFA,85,120,,2010-,10-Oct-19
8,Ferenc Pusk�s,�Hungary_Spain,UEFA,84,89,,1945-1962,24-Jul-52
8,Robert Lewandowski,�Poland,UEFA,84,156,,2008-,5-Oct-17
10,Godfrey Chitalu,�Zambia,CAF,79,111,,1968-1980,7-Nov-78


rank,player,nation,Confederation,golas,caps,goals_per_match,Career_span,Date_of_50th_goal
3,Ali Daei,�Iran,AFC,108,148,,1993-2006,9-Jan-00
4,Sunil Chhetri,�India,AFC,94,151,,2005-2024,31-Dec-15
5,Mokhtar Dahari,�Malaysia,AFC,89,142,,1972-1985,22-Aug-76
6,Ali Mabkhout,�United Arab Emirates,AFC,85,115,,2009-,31-Aug-19
12,Hussein Saeed,�Iraq,AFC,78,137,,1977-1990,17-Mar-84
14,Kunishige Kamamoto,�Japan,AFC,75,76,,1964-1977,18-Jul-72
14,Bashar Abdullah,�Kuwait,AFC,75,134,,1996-2007,25-Dec-02
18,Majed Abdullah,�Saudi Arabia,AFC,72,117,,1977-1994,15-Apr-84
19,Kiatisuk Senamuang,�Thailand,AFC,71,134,,1993-2007,23-Jan-01
22,Piyapong Pue-on,�Thailand,AFC,70,100,,1981-1997,30-Jan-89


rank,player,nation,Confederation,golas,caps,goals_per_match,Career_span,Date_of_50th_goal
6,Ali Mabkhout,�United Arab Emirates,AFC,85,115,,2009-,31-Aug-19
6,Romelu Lukaku,�Belgium,UEFA,85,120,,2010-,10-Oct-19


In [0]:
# Reading spark data as SQL

df_table = df.createOrReplaceTempView("Table")

df_new = spark.sql("""
          select * from table where nation like '%India%' and golas>90
          """)

df_new.display()


rank,player,nation,Confederation,golas,caps,goals_per_match,Career_span,Date_of_50th_goal
4,Sunil Chhetri,�India,AFC,94,151,,2005-2024,31-Dec-15
