# BlazingSQL Cheat Sheets sample code

(c) 2020 NVIDIA, Blazing SQL

Distributed under Apache License 2.0

### Imports

In [1]:
import cudf
import numpy as np
from blazingsql import BlazingContext

### Sample Data Table

In [2]:
# Ten-row sample table that every query in this sheet runs against.
# Columns: an integer, a float, a timestamp, two single-letter labels, a short
# word and a free-text sentence.  One value each in `datetime`, `word` and
# `string` is None so the examples also show how nulls come back.
#
# The two longest sentences are written as adjacent string literals (joined by
# Python at compile time) instead of triple-quoted strings, so the source line
# break and its indentation do not end up inside the stored text.
df = cudf.DataFrame(
    [
        (
            39, 6.88, np.datetime64('2020-10-08T12:12:01'), 'C', 'D', 'data',
            'RAPIDS.ai is a suite of open-source libraries that allow you to run your end to end data science and analytics pipelines on GPUs.',
        ),
        (
            11, 4.21, None, 'A', 'D', 'cuDF',
            'cuDF is a Python GPU DataFrame (built on the Apache Arrow columnar memory format)',
        ),
        (
            31, 4.71, np.datetime64('2020-10-10T09:26:43'), 'U', 'D', 'memory',
            'cuDF allows for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame style API.',
        ),
        (
            40, 0.93, np.datetime64('2020-10-11T17:10:00'), 'P', 'B', 'tabular',
            'If your workflow is fast enough on a single GPU or your data comfortably fits in memory on '
            'a single GPU, you would want to use cuDF.',
        ),
        (
            33, 9.26, np.datetime64('2020-10-15T10:58:02'), 'O', 'D', 'parallel',
            'If you want to distribute your workflow across multiple GPUs or have more data than you can fit '
            'in memory on a single GPU you would want to use Dask-cuDF',
        ),
        (
            42, 4.21, np.datetime64('2020-10-01T10:02:23'), 'U', 'C', 'GPUs',
            'BlazingSQL provides a high-performance distributed SQL engine in Python',
        ),
        (
            36, 3.01, np.datetime64('2020-09-30T14:36:26'), 'T', 'D', None,
            'BlazingSQL is built on the RAPIDS GPU data science ecosystem',
        ),
        (
            38, 6.44, np.datetime64('2020-10-10T08:34:36'), 'X', 'B', 'csv',
            'BlazingSQL lets you ETL raw data directly into GPU memory as a GPU DataFrame (GDF)',
        ),
        (
            17, 5.28, np.datetime64('2020-10-09T08:34:40'), 'P', 'D', 'dataframes',
            'Dask is a flexible library for parallel computing in Python',
        ),
        (
            10, 8.28, np.datetime64('2020-10-03T03:31:21'), 'W', 'B', 'python',
            None,
        ),
    ],
    columns=['number', 'float_number', 'datetime', 'letter', 'category', 'word', 'string'],
)

In [3]:
# Create the BlazingContext that the cells below use to register tables
# (`bc.create_table`) and to run SQL (`bc.sql`).
bc = BlazingContext()

BlazingContext ready


In [4]:
# Register the cuDF DataFrame `df` under the SQL table name 'df' so the
# queries below can refer to it in their FROM clause.
bc.create_table('df', df)

# SQL

#### SELECT

In [5]:
# Return every column of every row in table `df`.
query = '''
    SELECT *
    FROM df
'''

bc.sql(query)

Unnamed: 0,number,float_number,datetime,letter,category,word,string
0,39,6.88,2020-10-08 12:12:01,C,D,data,RAPIDS.ai is a suite of open-source libraries ...
1,11,4.21,,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...
2,31,4.71,2020-10-10 09:26:43,U,D,memory,"cuDF allows for loading, joining, aggregating,..."
3,40,0.93,2020-10-11 17:10:00,P,B,tabular,If your workflow is fast enough on a single GP...
4,33,9.26,2020-10-15 10:58:02,O,D,parallel,If you want to distribute your workflow across...
5,42,4.21,2020-10-01 10:02:23,U,C,GPUs,BlazingSQL provides a high-performance distrib...
6,36,3.01,2020-09-30 14:36:26,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...
7,38,6.44,2020-10-10 08:34:36,X,B,csv,BlazingSQL lets you ETL raw data directly into...
8,17,5.28,2020-10-09 08:34:40,P,D,dataframes,Dask is a flexible library for parallel comput...
9,10,8.28,2020-10-03 03:31:21,W,B,python,


In [6]:
# Project only the `number`, `letter` and `category` columns.
query = '''
    SELECT number
        , letter
        , category
    FROM df
'''

bc.sql(query)

Unnamed: 0,number,letter,category
0,39,C,D
1,11,A,D
2,31,U,D
3,40,P,B
4,33,O,D
5,42,U,C
6,36,T,D
7,38,X,B
8,17,P,D
9,10,W,B


#### DISTINCT

In [7]:
# List the distinct values found in `category`.
query = '''
    SELECT DISTINCT category
    FROM df
'''

bc.sql(query)

Unnamed: 0,category
0,B
1,C
2,D


In [8]:
# DISTINCT applies to the whole select list: list the distinct
# (category, letter) combinations.
query = '''
    SELECT DISTINCT category
        , letter
    FROM df
'''

bc.sql(query)

Unnamed: 0,category,letter
0,B,P
1,B,W
2,B,X
3,C,U
4,D,A
5,D,C
6,D,O
7,D,P
8,D,T
9,D,U


#### WHERE

In [9]:
# Keep only the rows whose `category` equals 'B'.
query = '''
    SELECT *
    FROM df
    WHERE category = 'B'
'''

bc.sql(query)

Unnamed: 0,number,float_number,datetime,letter,category,word,string
0,40,0.93,2020-10-11 17:10:00,P,B,tabular,If your workflow is fast enough on a single GP...
1,38,6.44,2020-10-10 08:34:36,X,B,csv,BlazingSQL lets you ETL raw data directly into...
2,10,8.28,2020-10-03 03:31:21,W,B,python,


In [10]:
# Keep only the rows with `number` strictly greater than 36.
query = '''
    SELECT *
    FROM df
    WHERE number > 36
'''

bc.sql(query)

Unnamed: 0,number,float_number,datetime,letter,category,word,string
0,39,6.88,2020-10-08 12:12:01,C,D,data,RAPIDS.ai is a suite of open-source libraries ...
1,40,0.93,2020-10-11 17:10:00,P,B,tabular,If your workflow is fast enough on a single GP...
2,42,4.21,2020-10-01 10:02:23,U,C,GPUs,BlazingSQL provides a high-performance distrib...
3,38,6.44,2020-10-10 08:34:36,X,B,csv,BlazingSQL lets you ETL raw data directly into...


In [11]:
# Keep the rows with `number` greater than or equal to 36 (inclusive bound,
# so the row with number = 36 is returned as well).
query = '''
    SELECT *
    FROM df
    WHERE number >= 36
'''

bc.sql(query)

Unnamed: 0,number,float_number,datetime,letter,category,word,string
0,39,6.88,2020-10-08 12:12:01,C,D,data,RAPIDS.ai is a suite of open-source libraries ...
1,40,0.93,2020-10-11 17:10:00,P,B,tabular,If your workflow is fast enough on a single GP...
2,42,4.21,2020-10-01 10:02:23,U,C,GPUs,BlazingSQL provides a high-performance distrib...
3,36,3.01,2020-09-30 14:36:26,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...
4,38,6.44,2020-10-10 08:34:36,X,B,csv,BlazingSQL lets you ETL raw data directly into...


#### GROUP BY

In [12]:
# Count the rows in each `category`.
query = '''
    SELECT category
        , COUNT(*) AS cnt
    FROM df
    GROUP BY category
'''

bc.sql(query)

Unnamed: 0,category,cnt
0,C,1
1,B,3
2,D,6


In [13]:
# Count the rows in each `category`.
# NOTE(review): this query is identical to the one in the preceding cell;
# confirm whether a different GROUP BY example was intended here.
query = '''
    SELECT category
        , COUNT(*) AS cnt
    FROM df
    GROUP BY category
'''

bc.sql(query)

Unnamed: 0,category,cnt
0,C,1
1,B,3
2,D,6


In [14]:
# Group on two columns: count the rows for each (category, letter) pair.
query = '''
    SELECT category
        , letter
        , COUNT(*) AS cnt
    FROM df
    GROUP BY category
        , letter
'''

bc.sql(query)

Unnamed: 0,category,letter,cnt
0,D,A,1
1,B,P,1
2,C,U,1
3,D,C,1
4,D,P,1
5,B,W,1
6,D,T,1
7,D,U,1
8,B,X,1
9,D,O,1


#### HAVING

In [15]:
# Count rows per `category`, then keep only the groups with more than one row.
# HAVING filters on the aggregate, after grouping.
query = '''
    SELECT category
        , COUNT(*) AS cnt
    FROM df
    GROUP BY category
    HAVING COUNT(*) > 1
'''

bc.sql(query)

Unnamed: 0,category,cnt
0,B,3
1,D,6


In [16]:
query = '''
    SELECT category
        , COUNT(*) AS cnt
        , STDDEV(CAST(number AS FLOAT)) AS var
    FROM df
    GROUP BY category
    HAVING STDDEV(CAST(number AS FLOAT)) > 12.0
'''

bc.sql(query)

Unnamed: 0,category,cnt,var
0,B,3,16.772995


#### ORDER BY

In [17]:
# Sort all rows by `category`.  No direction is given; the result shown for
# this cell is in ascending order (B, C, D).
query = '''
    SELECT *
    FROM df
    ORDER BY category
'''

bc.sql(query)

Unnamed: 0,number,float_number,datetime,letter,category,word,string
0,40,0.93,2020-10-11 17:10:00,P,B,tabular,If your workflow is fast enough on a single GP...
1,38,6.44,2020-10-10 08:34:36,X,B,csv,BlazingSQL lets you ETL raw data directly into...
2,10,8.28,2020-10-03 03:31:21,W,B,python,
3,42,4.21,2020-10-01 10:02:23,U,C,GPUs,BlazingSQL provides a high-performance distrib...
4,39,6.88,2020-10-08 12:12:01,C,D,data,RAPIDS.ai is a suite of open-source libraries ...
5,11,4.21,,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...
6,31,4.71,2020-10-10 09:26:43,U,D,memory,"cuDF allows for loading, joining, aggregating,..."
7,33,9.26,2020-10-15 10:58:02,O,D,parallel,If you want to distribute your workflow across...
8,36,3.01,2020-09-30 14:36:26,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...
9,17,5.28,2020-10-09 08:34:40,P,D,dataframes,Dask is a flexible library for parallel comput...


In [18]:
# Sort by `category` descending, breaking ties within a category by `number`
# ascending.
query = '''
    SELECT *
    FROM df
    ORDER BY category DESC, number ASC
'''

bc.sql(query)

Unnamed: 0,number,float_number,datetime,letter,category,word,string
0,11,4.21,,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...
1,17,5.28,2020-10-09 08:34:40,P,D,dataframes,Dask is a flexible library for parallel comput...
2,31,4.71,2020-10-10 09:26:43,U,D,memory,"cuDF allows for loading, joining, aggregating,..."
3,33,9.26,2020-10-15 10:58:02,O,D,parallel,If you want to distribute your workflow across...
4,36,3.01,2020-09-30 14:36:26,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...
5,39,6.88,2020-10-08 12:12:01,C,D,data,RAPIDS.ai is a suite of open-source libraries ...
6,42,4.21,2020-10-01 10:02:23,U,C,GPUs,BlazingSQL provides a high-performance distrib...
7,10,8.28,2020-10-03 03:31:21,W,B,python,
8,38,6.44,2020-10-10 08:34:36,X,B,csv,BlazingSQL lets you ETL raw data directly into...
9,40,0.93,2020-10-11 17:10:00,P,B,tabular,If your workflow is fast enough on a single GP...


#### CASE

In [19]:
# Derive a 0/1 flag `number_cat` next to `number`: 1 when number > 36,
# otherwise 0.
query = '''
    SELECT number
        , CASE WHEN number > 36 THEN 1 ELSE 0 END AS number_cat
    FROM df
'''

bc.sql(query)

Unnamed: 0,number,number_cat
0,39,1
1,11,0
2,31,0
3,40,1
4,33,0
5,42,1
6,36,0
7,38,1
8,17,0
9,10,0


In [20]:
# Average `float_number` within each of the two `number_cat` buckets
# (number > 36 versus the rest).  The CASE expression is written out again in
# GROUP BY instead of referring to the `number_cat` alias.
query = '''
    SELECT CASE WHEN number > 36 THEN 1 ELSE 0 END AS number_cat
        , AVG(float_number) AS float_mean
    FROM df
    GROUP BY CASE WHEN number > 36 THEN 1 ELSE 0 END
    
'''

bc.sql(query)

Unnamed: 0,number_cat,float_mean
0,1,4.615
1,0,5.791667


#### LIKE

In [21]:
query = '''
    SELECT string
        , CASE WHEN string LIKE '%GPU%' THEN 1 ELSE 0 END AS Blazing
    FROM df
'''

bc.sql(query)

Unnamed: 0,string,Blazing
0,RAPIDS.ai is a suite of open-source libraries ...,1
1,cuDF is a Python GPU DataFrame (built on the A...,1
2,"cuDF allows for loading, joining, aggregating,...",0
3,If your workflow is fast enough on a single GP...,1
4,If you want to distribute your workflow across...,1
5,BlazingSQL provides a high-performance distrib...,0
6,BlazingSQL is built on the RAPIDS GPU data sci...,1
7,BlazingSQL lets you ETL raw data directly into...,1
8,Dask is a flexible library for parallel comput...,0
9,,0


In [22]:
# Rows whose `string` starts with 'BlazingS' followed by at least two more
# characters: each `_` matches exactly one character and the trailing `%`
# matches any remaining run of characters.
query = '''
    SELECT *
    FROM df
    WHERE string LIKE 'BlazingS__%'
'''

bc.sql(query)

Unnamed: 0,number,float_number,datetime,letter,category,word,string
0,42,4.21,2020-10-01 10:02:23,U,C,GPUs,BlazingSQL provides a high-performance distrib...
1,36,3.01,2020-09-30 14:36:26,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...
2,38,6.44,2020-10-10 08:34:36,X,B,csv,BlazingSQL lets you ETL raw data directly into...


#### CAST

In [23]:
# Cast `number` to FLOAT and return it as `number_float`.
query = '''
    SELECT CAST(number AS FLOAT) AS number_float
    FROM df
'''

bc.sql(query)

Unnamed: 0,number_float
0,39.0
1,11.0
2,31.0
3,40.0
4,33.0
5,42.0
6,36.0
7,38.0
8,17.0
9,10.0


In [24]:
# Cast `float_number` to INTEGER and show it beside the original value.
# In the result shown for this cell the fractional part is dropped
# (e.g. 6.88 comes back as 6).
query = '''
    SELECT float_number
        , CAST(float_number AS INTEGER) AS number_float
    FROM df
'''

bc.sql(query)

Unnamed: 0,float_number,number_float
0,6.88,6
1,4.21,4
2,4.71,4
3,0.93,0
4,9.26,9
5,4.21,4
6,3.01,3
7,6.44,6
8,5.28,5
9,8.28,8


#### LIMIT

In [25]:
# Return at most three rows of `df`.
# NOTE(review): there is no ORDER BY, so the query itself does not pin down
# which three rows are returned; the result shown happens to be the first
# three rows of the sample table.
query = '''
    SELECT *
    FROM df
    LIMIT 3
'''

bc.sql(query)

Unnamed: 0,number,float_number,datetime,letter,category,word,string
0,39,6.88,2020-10-08 12:12:01,C,D,data,RAPIDS.ai is a suite of open-source libraries ...
1,11,4.21,,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...
2,31,4.71,2020-10-10 09:26:43,U,D,memory,"cuDF allows for loading, joining, aggregating,..."
