In [3]:
# findspark.init() is called before pyspark is imported.
# NOTE(review): presumably it locates the local Spark install and makes
# pyspark importable, so the order of these lines matters — confirm.
import findspark
findspark.init()
import json
import pyspark
from pyspark.sql import SparkSession
# NOTE(review): wildcard import; none of its names are used in the cells
# visible here — consider importing only the types actually needed.
from pyspark.sql.types import *

In [None]:
# Obtain the SparkSession used by every later cell (`spark.table`, `spark.sql`);
# getOrCreate() reuses an existing session if there is one.
spark = SparkSession.builder.getOrCreate()

In [5]:
# Load the extension that provides the %sparksql / %%sparksql magics
# used throughout the rest of this notebook.
%load_ext sparksql

The sparksql extension is already loaded. To reload it, use:
  %reload_ext sparksql


In [9]:
%%sparksql
-- Create the demo table queried by the rest of the notebook.
-- IF NOT EXISTS keeps this cell re-runnable: without it a second run
-- fails because the table already exists.
CREATE TABLE IF NOT EXISTS student (id INT, name STRING, age INT)
USING
    PARQUET


In [None]:
# Populate the demo table with its three rows in a single statement.
# INSERT OVERWRITE (instead of three appending INSERT INTO statements) makes
# the cell idempotent: re-running it leaves exactly these three rows rather
# than adding duplicates. On a freshly created table the result is the same.
%sparksql INSERT OVERWRITE TABLE student (id, name, age) VALUES (1, 'Alice', 11), (2, 'Bob', 12), (3, 'Eve', 13)

In [11]:
# Line-magic form: run a one-line query and display its result.
%sparksql  SELECT * FROM student

only showing top 2 row(s)


0,1,2
id,name,age
1,Alice,11
2,Bob,12


In [13]:
# Display at most 2 rows per query result.
%config SparkSql.limit=2
# Re-generate the schema file once it is older than 20 seconds.
%config SparkSql.cacheTTL=20

In [14]:
%%sparksql -c -v MY_VIEW -d myDf
-- Flags on the magic line above:
--   -c          cache the dataframe
--   -v MY_VIEW  create or replace the temporary view MY_VIEW
--   -d myDf     capture the dataframe in the local variable myDf
SELECT * FROM student

TTL 20 seconds expired, re-generating schema file: /tmp/sparkdb.schema.json
Schema file updated: /tmp/sparkdb.schema.json
cache dataframe with lazy load
create temporary view `MY_VIEW`
capture dataframe to local variable `myDf`
only showing top 2 row(s)


0,1,2
id,name,age
1,Alice,11
3,Eve,13


In [15]:
# `myDf` was captured by the `-d myDf` option of the %%sparksql cell above.
myDf.show()

+---+-----+---+
| id| name|age|
+---+-----+---+
|  1|Alice| 11|
|  3|  Eve| 13|
|  2|  Bob| 12|
+---+-----+---+



In [16]:
# `MY_VIEW` is the temporary view created by the `-v MY_VIEW` option above;
# it is readable through the regular session API.
spark.table('MY_VIEW').show()

+---+-----+---+
| id| name|age|
+---+-----+---+
|  1|Alice| 11|
|  3|  Eve| 13|
|  2|  Bob| 12|
+---+-----+---+



In [17]:
%%sparksql
-- The sql-language-server uses the schema file to suggest code completions.
-- While editing, press Tab to trigger auto-completion.
SELECT
    s.id,
    s.name,
    s.age
FROM
    student AS s

only showing top 2 row(s)


0,1,2
id,name,age
1,Alice,11
3,Eve,13


In [12]:
# Hold a Spark SQL statement in an ordinary Python string.
# The --start-sparksql / --end-sparksql lines are SQL comments that stay part
# of the string (they are printed below and passed to Spark as-is).
# NOTE(review): presumably they mark the region the editor extension treats as
# SQL for highlighting/completion — confirm against the extension's docs.
sql = '''
--start-sparksql
SELECT
    *
FROM
    student AS s
--end-sparksql
'''
print(sql)


--start-sparksql
SELECT
    *
FROM
    student AS s
--end-sparksql



In [13]:
# Execute the statement held in the `sql` string (defined in the previous cell)
# through the session API and print the resulting rows.
spark.sql(sql).show()

+---+-----+---+
| id| name|age|
+---+-----+---+
|  1|Alice| 11|
|  2|  Bob| 12|
|  3|  Eve| 13|
+---+-----+---+



### Displaying the output of multiple line magics in a cell
Normally, IPython only displays the output of the last statement in a cell. However, it can be handy to run multiple SQL magics in a single cell and see the output of each one. Setting `ast_node_interactivity` to `all` enables that.

In [18]:
# Display the result of every top-level expression in a cell, not only the
# last one, so several %sparksql line magics in one cell each show output.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [19]:
# Two queries in one cell; each result is displayed because
# ast_node_interactivity is set to "all".
print('Results where age below 12')
%sparksql SELECT * FROM student WHERE age < 12
# The label matches the predicate: `age >= 12` includes age 12 itself.
print('Results where age 12 or above')
%sparksql SELECT * FROM student WHERE age >= 12

Results where age below 12


0,1,2
id,name,age
1,Alice,11


Results where age above 12


0,1,2
id,name,age
3,Eve,13
2,Bob,12


In [20]:
%%sparksql?

Docstring:
::

  %sparksql [-d DATAFRAME] [-c] [-e] [-v VIEW] [-l LIMIT] [-f OUTPUTFILE]
                [-t CACHETTL]
                [sql ...]

Magic that works both as %sparksql and as %%sparksql

positional arguments:
  sql                   SQL statement

optional arguments:
  -d DATAFRAME, --dataframe DATAFRAME
                        Capture dataframe in a local variable
  -c, --cache           Cache dataframe
  -e, --eager           Cache dataframe with eager load
  -v VIEW, --view VIEW  Create or replace temporary view
  -l LIMIT, --limit LIMIT
                        The maximum number of rows to display
  -f OUTPUTFILE, --outputFile OUTPUTFILE
                        Output schema to specified file, defaults to
                        /tmp/sparkdb.schema.json
  -t CACHETTL, --cacheTTL CACHETTL
                        Re-generate output schema file if older than time
                        specified (defaults to 3600 seconds)
File:      ~/jupyter-ext/sp

### Predefined Configuration

The SparkSql magic can be pre-configured through IPython's `profile_default` profile. For example, you can set the output location of the schema file. To do this, edit the `ipython_config.py` file.


```bash
$ cat ~/.ipython/profile_default/ipython_config.py 

# Get the config object that IPython provides to profile config files.
c = get_config()

# Pre-load the sparksql magic so notebooks do not need `%load_ext sparksql`.
c.InteractiveShellApp.extensions = [
    'sparksql'
]

# Pre-configure the SparkSql magic: rows displayed per result, schema-file
# refresh interval in seconds, and schema-file location.
c.SparkSql.limit=20
c.SparkSql.cacheTTL=3600
c.SparkSql.outputFile='/tmp/sparkdb.schema.json'


# Pre-configure notebooks to display the output of every statement in a cell.
# Set through the config object like the options above, so no import of
# InteractiveShell and no class-attribute override is needed.
c.InteractiveShell.ast_node_interactivity = 'all'
```