In [1]:
import json
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import *

In [None]:
# Reuse the active SparkSession if one exists; otherwise start a new one.
# No builder options are set here, so Spark's defaults apply.
spark = SparkSession.builder.getOrCreate()

    Normally, IPython only displays the output of the last statement in a cell. However, it can be handy to run multiple SQL magics in a single cell and see the output of each execution. Setting `ast_node_interactivity` to `all` enables that.


In [3]:
# Make IPython display the result of every top-level expression in a cell
# (not only the last one), so several SQL magics in one cell each show output.
# The attribute is set on the class, not on a shell instance.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [4]:
# Load the extension that provides the %sparksql / %%sparksql magics used below.
%load_ext ipython_magic.sparksql

In [5]:
# Schema-cache settings for the sparksql magic:
#   cacheTTL   - age in seconds after which the schema file is regenerated
#   outputFile - path the generated schema file is written to
# NOTE(review): the /tmp path is absolute and POSIX-specific; adjust on other systems.
%config SparkSql.cacheTTL=3600
%config SparkSql.outputFile="/tmp/sparkdb.schema.json"

In [6]:
# Load the sample contacts file (the path is relative to the working directory)
# and print the schema of the resulting DataFrame.
df = spark.read.json("contacts.json")
df.printSchema()

root
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- postalCode: string (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- streetAddress: string (nullable = true)
 |-- age: long (nullable = true)
 |-- first Name: string (nullable = true)
 |-- last Name: string (nullable = true)
 |-- phoneNumbers: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- number: string (nullable = true)
 |    |    |-- type: string (nullable = true)



                                                                                

In [7]:
# Expose the DataFrame to SQL as a temporary view (replacing any view of the same name).
# SHOW TABLES reports the name in lower case, as `contacts_table`.
df.createOrReplaceTempView("CONTACTS_TABLE")

In [8]:
# Line-magic form: the SQL statement is given on the same line as the magic.
# This call also (re)generates the schema file configured via SparkSql.outputFile.
%sparksql SHOW TABLES

TTL 3600 seconds expired, re-generating schema file: /tmp/sparkdb.schema.json
Generating schema file: /tmp/sparkdb.schema.json
Schema file updated: /tmp/sparkdb.schema.json


0,1,2
database,tableName,isTemporary
,contacts_table,True


# Press Tab to trigger autocompletion and Ctrl-Q to format the cell

In [9]:
%%sparksql --output grid --limit 1000
-- Pair each id produced by RANGE(1, 1000) with a freshly generated UUID
-- and render the result in the interactive grid widget.
SELECT id, uuid()
FROM range(1, 1000)


DataGrid(auto_fit_params={'area': 'all', 'padding': 30, 'numCols': None}, corner_renderer=None, default_render…

In [10]:
%%sparksql --output html --limit 3
-- Per contact: the first name, the type of the first phoneNumbers entry,
-- and whether any phoneNumbers entry has type 'home'.
SELECT
    c.`first Name`,
    c.phoneNumbers[0].type AS primary_number,
    array_contains(c.phoneNumbers.type, 'home') AS flag
FROM contacts_table AS c


0,1,2
first Name,primary_number,flag
Rack,home,True


In [11]:
%%sparksql --output json --limit 3
-- Return every column of the contacts view; at most 3 rows are displayed, as JSON.
SELECT *
FROM contacts_table


<IPython.core.display.JSON object>

# Create a temporary view with the --view option

In [12]:
%%sparksql --view the_exploded_table --output skip
-- Keep all original columns and add one row per phoneNumbers element as `phoneNumber`.
-- The result is registered as the temporary view named by --view; nothing is displayed.
SELECT
    c.*,
    explode(c.phoneNumbers) AS phoneNumber
FROM contacts_table AS c


Created temporary view `the_exploded_table`
Updating local tables
Query execution skipped


In [13]:
# Confirm that the view created with --view is now registered alongside contacts_table.
%sparksql SHOW TABLES

0,1,2
database,tableName,isTemporary
,contacts_table,True
,the_exploded_table,True


# Use the temporary view in subsequent queries with autocomplete suggestions

In [14]:
%%sparksql
-- Read back every column of the temporary view created with --view.
SELECT *
FROM the_exploded_table


0,1,2,3,4,5
address,age,first Name,last Name,phoneNumbers,phoneNumber
"Row(city='San Jone', postalCode='394221', state='CA', streetAddress='126')",24,Rack,Jackon,"[Row(number='7383627627', type='home')]","Row(number='7383627627', type='home')"


# Create a DataFrame variable to use in PySpark

In [15]:
%%sparksql --dataframe the_exploded_dataframe --output skip
-- Keep all original columns and add one row per phoneNumbers element as `phoneNumber`.
-- The result is captured in the local variable named by --dataframe; nothing is displayed.
SELECT
    c.*,
    explode(c.phoneNumbers) AS phoneNumber
FROM contacts_table AS c


Captured dataframe to local variable `the_exploded_dataframe`
Query execution skipped


# Continue developing your query using the DataFrame API

In [16]:
# The variable captured by --dataframe is a regular DataFrame:
# keep only the exploded column and print it as a text table.
the_exploded_dataframe.select('phoneNumber').show()

+------------------+
|       phoneNumber|
+------------------+
|{7383627627, home}|
+------------------+



# Edit SQL within python strings
## Enjoy the same functionality as a code cell
- syntax highlighting
- code completion
- SQL formatting

In [17]:
# Declare the query as a plain Python string.
# The --start-sparksql / --end-sparksql lines are ordinary SQL comments, so they are
# harmless when the string is executed. Presumably they delimit the region the editor
# treats as SQL (highlighting, completion, formatting) — confirm against the extension docs.
sql = '''
--start-sparksql
SELECT
    *, 
    explode(con.phoneNumbers) as phoneNumber
FROM
    contacts_table AS con
--end-sparksql
'''
print(sql)


--start-sparksql
SELECT
    *, 
    explode(con.phoneNumbers) as phoneNumber
FROM
    contacts_table AS con
--end-sparksql



In [18]:
# Run the string through the regular PySpark API (no magic involved) and print the result.
# The marker comments inside `sql` are part of the submitted SQL text.
spark.sql(sql).show()

+--------------------+---+----------+---------+--------------------+------------------+
|             address|age|first Name|last Name|        phoneNumbers|       phoneNumber|
+--------------------+---+----------+---------+--------------------+------------------+
|{San Jone, 394221...| 24|      Rack|   Jackon|[{7383627627, home}]|{7383627627, home}|
+--------------------+---+----------+---------+--------------------+------------------+



In [19]:
%%sparksql?

Docstring:
::

  %sparksql [-l max_rows] [-r all|local|none] [-d name] [-c] [-e]
                [-v name] [-o sql|json|html|grid|skip|none]
                [sql [sql ...]]

Magic that works both as %sparksql and as %%sparksql

positional arguments:
  sql                   SQL statement to execute

optional arguments:
  -l max_rows, --limit max_rows
                        The maximum number of rows to display. A value of zero
                        is equivalent to `--output skip`
  -r <all|local|none>, --refresh <all|local|none>
                        Force the regeneration of the schema cache file. The
                        `local` option will only update tables/views created
                        in the local Spark context.
  -d name, --dataframe name
                        Capture dataframe in a local variable named `name`
  -c, --cache           Cache dataframe
  -e, --eager           Cache dataframe with eager load
  -v name, --view name  Create or replace a te

In [20]:
# Force regeneration of the schema cache file without running a query.
# `all` is the full refresh; `local` would only update tables/views created
# in the local Spark context.
%sparksql --refresh all

Generating schema file: /tmp/sparkdb.schema.json
Schema file updated: /tmp/sparkdb.schema.json
Updating local tables
