# BlazingSQL Cheat Sheets sample code

(c) 2020 NVIDIA, BlazingSQL

Distributed under Apache License 2.0

### Imports

In [1]:
import cudf
import numpy as np
from blazingsql import BlazingContext

### Sample Data Table

In [2]:
# Sample table used by every join example below: ten rows with integer,
# float, datetime, single-letter, category, word and free-text columns.
# The 'datetime', 'word' and 'string' columns each contain one missing value
# (None).
#
# Long sentences are written as adjacent string literals (which Python joins
# into one string) rather than triple-quoted strings, so that no line break or
# source indentation ends up inside the sample text.
df = cudf.DataFrame(
    [
        (
            39, 6.88, np.datetime64('2020-10-08T12:12:01'), 'C', 'D', 'data',
            'RAPIDS.ai is a suite of open-source libraries that allow you to run your end to end data science and analytics pipelines on GPUs.',
        ),
        (
            11, 4.21, None, 'A', 'D', 'cuDF',
            'cuDF is a Python GPU DataFrame (built on the Apache Arrow columnar memory format)',
        ),
        (
            31, 4.71, np.datetime64('2020-10-10T09:26:43'), 'U', 'D', 'memory',
            'cuDF allows for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame style API.',
        ),
        (
            40, 0.93, np.datetime64('2020-10-11T17:10:00'), 'P', 'B', 'tabular',
            'If your workflow is fast enough on a single GPU or your data '
            'comfortably fits in memory on a single GPU, you would want to '
            'use cuDF.',
        ),
        (
            33, 9.26, np.datetime64('2020-10-15T10:58:02'), 'O', 'D', 'parallel',
            'If you want to distribute your workflow across multiple GPUs or '
            'have more data than you can fit in memory on a single GPU you '
            'would want to use Dask-cuDF',
        ),
        (
            42, 4.21, np.datetime64('2020-10-01T10:02:23'), 'U', 'C', 'GPUs',
            'BlazingSQL provides a high-performance distributed SQL engine in Python',
        ),
        (
            36, 3.01, np.datetime64('2020-09-30T14:36:26'), 'T', 'D', None,
            'BlazingSQL is built on the RAPIDS GPU data science ecosystem',
        ),
        (
            38, 6.44, np.datetime64('2020-10-10T08:34:36'), 'X', 'B', 'csv',
            'BlazingSQL lets you ETL raw data directly into GPU memory as a GPU DataFrame (GDF)',
        ),
        (
            17, 5.28, np.datetime64('2020-10-09T08:34:40'), 'P', 'D', 'dataframes',
            'Dask is a flexible library for parallel computing in Python',
        ),
        (
            10, 8.28, np.datetime64('2020-10-03T03:31:21'), 'W', 'B', 'python',
            None,
        ),
    ],
    columns=['number', 'float_number', 'datetime', 'letter', 'category', 'word', 'string'],
)

In [3]:
# Small right-hand table for the join examples: a 'means' value per 'cat' code.
# 'B' and 'C' occur in df.category; 'V' does not.
df_join = cudf.DataFrame(
    {
        'cat': ['B', 'C', 'V'],
        'means': [23.4, 71.4, 823.2],
    }
)

In [4]:
# Create the BlazingContext; the cells below use it to register tables
# (create_table) and to run SQL queries (sql).
bc = BlazingContext()

BlazingContext ready


In [5]:
# Register both DataFrames with the context under the table names that the
# SQL queries below refer to: 'df' (sample data) and 'df2' (df_join).
bc.create_table('df', df)
bc.create_table('df2', df_join)

# SQL Joins

In [6]:
# Join a derived subquery to df2 and aggregate the result.
# - Subquery A: rows of df with float_number > 0.93, exposing category, number
#   and `increased` (float_number + 3.12).
# - A is inner-joined to df2 (alias B) on A.category = B.cat.
# - The result is grouped by (category, means), returning SUM(number) and
#   AVG(increased) per group, ordered by category.
query = '''
    SELECT A.category
        , B.means
        , SUM(A.number) AS number_sum
        , AVG(A.increased) AS increased_avg
    FROM (
        SELECT category
            , number
            , float_number + 3.12 AS increased
        FROM df
        WHERE float_number > 0.93
    ) AS A
    INNER JOIN df2 AS B
        ON A.category = B.cat
    GROUP BY A.category
        , B.means
    ORDER BY category
'''

bc.sql(query)

Unnamed: 0,category,means,number_sum,increased_avg
0,B,23.4,48,10.48
1,C,71.4,42,7.33


#### INNER

In [7]:
# INNER JOIN using table aliases (A for df, B for df2): returns all columns of
# both tables for the rows where df.category equals df2.cat.
query = '''
    SELECT *
    FROM df AS A
    INNER JOIN df2 AS B
        ON A.category = B.cat
'''

bc.sql(query)

Unnamed: 0,number,float_number,datetime,letter,category,word,string,cat,means
0,40,0.93,2020-10-11 17:10:00,P,B,tabular,If your workflow is fast enough on a single GP...,B,23.4
1,42,4.21,2020-10-01 10:02:23,U,C,GPUs,BlazingSQL provides a high-performance distrib...,C,71.4
2,38,6.44,2020-10-10 08:34:36,X,B,csv,BlazingSQL lets you ETL raw data directly into...,B,23.4
3,10,8.28,2020-10-03 03:31:21,W,B,python,,B,23.4


In [8]:
# Same INNER JOIN as the previous cell, but without aliases: the join columns
# are qualified with the table names directly.
query = '''
    SELECT *
    FROM df
    INNER JOIN df2
        ON df.category = df2.cat
'''

bc.sql(query)

Unnamed: 0,number,float_number,datetime,letter,category,word,string,cat,means
0,40,0.93,2020-10-11 17:10:00,P,B,tabular,If your workflow is fast enough on a single GP...,B,23.4
1,42,4.21,2020-10-01 10:02:23,U,C,GPUs,BlazingSQL provides a high-performance distrib...,C,71.4
2,38,6.44,2020-10-10 08:34:36,X,B,csv,BlazingSQL lets you ETL raw data directly into...,B,23.4
3,10,8.28,2020-10-03 03:31:21,W,B,python,,B,23.4


#### FULL OUTER

In [9]:
# FULL OUTER JOIN: matched rows are combined, and unmatched rows from either
# table are kept with the other table's columns left null (e.g. cat = 'V' has
# no df row, and category = 'D' rows have no df2 row).
query = '''
    SELECT *
    FROM df
    FULL OUTER JOIN df2
        ON df.category = df2.cat
'''

bc.sql(query)

Unnamed: 0,number,float_number,datetime,letter,category,word,string,cat,means
0,,,,,,,,V,823.2
1,40.0,0.93,2020-10-11 17:10:00,P,B,tabular,If your workflow is fast enough on a single GP...,B,23.4
2,42.0,4.21,2020-10-01 10:02:23,U,C,GPUs,BlazingSQL provides a high-performance distrib...,C,71.4
3,38.0,6.44,2020-10-10 08:34:36,X,B,csv,BlazingSQL lets you ETL raw data directly into...,B,23.4
4,10.0,8.28,2020-10-03 03:31:21,W,B,python,,B,23.4
5,39.0,6.88,2020-10-08 12:12:01,C,D,data,RAPIDS.ai is a suite of open-source libraries ...,,
6,11.0,4.21,,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...,,
7,31.0,4.71,2020-10-10 09:26:43,U,D,memory,"cuDF allows for loading, joining, aggregating,...",,
8,33.0,9.26,2020-10-15 10:58:02,O,D,parallel,If you want to distribute your workflow across...,,
9,36.0,3.01,2020-09-30 14:36:26,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...,,


#### LEFT OUTER

In [10]:
# LEFT JOIN with df on the left: every row of df is kept; cat and means are
# null for rows whose category has no match in df2 (category = 'D').
query = '''
    SELECT *
    FROM df
    LEFT JOIN df2
        ON df.category = df2.cat
'''

bc.sql(query)

Unnamed: 0,number,float_number,datetime,letter,category,word,string,cat,means
0,40,0.93,2020-10-11 17:10:00,P,B,tabular,If your workflow is fast enough on a single GP...,B,23.4
1,42,4.21,2020-10-01 10:02:23,U,C,GPUs,BlazingSQL provides a high-performance distrib...,C,71.4
2,38,6.44,2020-10-10 08:34:36,X,B,csv,BlazingSQL lets you ETL raw data directly into...,B,23.4
3,10,8.28,2020-10-03 03:31:21,W,B,python,,B,23.4
4,39,6.88,2020-10-08 12:12:01,C,D,data,RAPIDS.ai is a suite of open-source libraries ...,,
5,11,4.21,,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...,,
6,31,4.71,2020-10-10 09:26:43,U,D,memory,"cuDF allows for loading, joining, aggregating,...",,
7,33,9.26,2020-10-15 10:58:02,O,D,parallel,If you want to distribute your workflow across...,,
8,36,3.01,2020-09-30 14:36:26,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...,,
9,17,5.28,2020-10-09 08:34:40,P,D,dataframes,Dask is a flexible library for parallel comput...,,


In [11]:
# LEFT JOIN with the tables swapped (df2 on the left): every row of df2 is
# kept, so cat = 'V' appears with all df columns null, while df rows without a
# matching cat are dropped.
query = '''
    SELECT *
    FROM df2
    LEFT JOIN df
        ON df.category = df2.cat
'''

bc.sql(query)

Unnamed: 0,cat,means,number,float_number,datetime,letter,category,word,string
0,B,23.4,40.0,0.93,2020-10-11 17:10:00,P,B,tabular,If your workflow is fast enough on a single GP...
1,C,71.4,42.0,4.21,2020-10-01 10:02:23,U,C,GPUs,BlazingSQL provides a high-performance distrib...
2,V,823.2,,,,,,,
3,B,23.4,38.0,6.44,2020-10-10 08:34:36,X,B,csv,BlazingSQL lets you ETL raw data directly into...
4,B,23.4,10.0,8.28,2020-10-03 03:31:21,W,B,python,


#### CROSS JOIN

In [12]:
# CROSS JOIN: no ON clause; each row of df is paired with each row of df2.
query = '''
    SELECT *
    FROM df
    CROSS JOIN df2
'''

bc.sql(query)

Unnamed: 0,number,float_number,datetime,letter,category,word,string,cat,means
0,39,6.88,2020-10-08 12:12:01,C,D,data,RAPIDS.ai is a suite of open-source libraries ...,B,23.4
1,39,6.88,2020-10-08 12:12:01,C,D,data,RAPIDS.ai is a suite of open-source libraries ...,C,71.4
2,39,6.88,2020-10-08 12:12:01,C,D,data,RAPIDS.ai is a suite of open-source libraries ...,V,823.2
3,11,4.21,,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...,B,23.4
4,11,4.21,,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...,C,71.4
5,11,4.21,,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...,V,823.2
6,31,4.71,2020-10-10 09:26:43,U,D,memory,"cuDF allows for loading, joining, aggregating,...",B,23.4
7,31,4.71,2020-10-10 09:26:43,U,D,memory,"cuDF allows for loading, joining, aggregating,...",C,71.4
8,31,4.71,2020-10-10 09:26:43,U,D,memory,"cuDF allows for loading, joining, aggregating,...",V,823.2
9,40,0.93,2020-10-11 17:10:00,P,B,tabular,If your workflow is fast enough on a single GP...,B,23.4
