# cuDF Cheat Sheets sample code

(c) 2020 NVIDIA, Blazing SQL

Distributed under Apache License 2.0

### Imports

In [1]:
import cudf
import pandas as pd
import numpy as np

### Sample DataFrame

In [2]:
# pandas: host-side reference frame built from ten hand-written records.
# Each record is (num, float, datetime, timedelta, char, category, word, string);
# a few fields are deliberately None so that missing values show up in the examples.
pandas_df = pd.DataFrame(
    data=[
        (
            39, 6.88, np.datetime64('2020-10-08T12:12:01'), np.timedelta64(14378, 's'), 'C', 'D', 'data',
            'RAPIDS.ai is a suite of open-source libraries that allow you to run your end to end data science and analytics pipelines on GPUs.',
        ),
        (
            11, 4.21, None, None, 'A', 'D', 'cuDF',
            'cuDF is a Python GPU DataFrame (built on the Apache Arrow columnar memory format)',
        ),
        (
            31, 4.71, np.datetime64('2020-10-10T09:26:43'), np.timedelta64(12909, 's'), 'U', 'D', 'memory',
            'cuDF allows for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame style API.',
        ),
        (
            40, 0.93, np.datetime64('2020-10-11T17:10:00'), np.timedelta64(10466, 's'), 'P', 'B', 'tabular',
            # The embedded newline and the run of spaces after it are part of the value.
            'If your workflow is fast enough on a single GPU or your data comfortably fits in memory on \n'
            '                 a single GPU, you would want to use cuDF.',
        ),
        (
            33, 9.26, np.datetime64('2020-10-15T10:58:02'), np.timedelta64(35558, 's'), 'O', 'D', 'parallel',
            # The embedded newline and the run of spaces after it are part of the value.
            'If you want to distribute your workflow across multiple GPUs or have more data than you can fit \n'
            '                 in memory on a single GPU you would want to use Dask-cuDF',
        ),
        (
            42, 4.21, np.datetime64('2020-10-01T10:02:23'), np.timedelta64(20480, 's'), 'U', 'C', 'GPUs',
            'BlazingSQL provides a high-performance distributed SQL engine in Python',
        ),
        (
            36, 3.01, np.datetime64('2020-09-30T14:36:26'), np.timedelta64(24409, 's'), 'T', 'D', None,
            'BlazingSQL is built on the RAPIDS GPU data science ecosystem',
        ),
        (
            38, 6.44, np.datetime64('2020-10-10T08:34:36'), np.timedelta64(90171, 's'), 'X', 'B', 'csv',
            'BlazingSQL lets you ETL raw data directly into GPU memory as a GPU DataFrame (GDF)',
        ),
        (
            17, 5.28, np.datetime64('2020-10-09T08:34:40'), np.timedelta64(30532, 's'), 'P', 'D', 'dataframes',
            'Dask is a flexible library for parallel computing in Python',
        ),
        (
            10, 8.28, np.datetime64('2020-10-03T03:31:21'), np.timedelta64(23552, 's'), 'W', 'B', 'python',
            None,
        ),
    ],
    columns=['num', 'float', 'datetime', 'timedelta', 'char', 'category', 'word', 'string'],
)
# Convert the 'category' column from plain strings to the categorical dtype.
pandas_df = pandas_df.astype({'category': 'category'})

In [3]:
# cudf: GPU-side frame built from the same ten hand-written records as the pandas example.
# Each record is (num, float, datetime, timedelta, char, category, word, string);
# a few fields are deliberately None so that missing values show up in the examples.
cudf_df = cudf.DataFrame(
    data=[
        (
            39, 6.88, np.datetime64('2020-10-08T12:12:01'), np.timedelta64(14378, 's'), 'C', 'D', 'data',
            'RAPIDS.ai is a suite of open-source libraries that allow you to run your end to end data science and analytics pipelines on GPUs.',
        ),
        (
            11, 4.21, None, None, 'A', 'D', 'cuDF',
            'cuDF is a Python GPU DataFrame (built on the Apache Arrow columnar memory format)',
        ),
        (
            31, 4.71, np.datetime64('2020-10-10T09:26:43'), np.timedelta64(12909, 's'), 'U', 'D', 'memory',
            'cuDF allows for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame style API.',
        ),
        (
            40, 0.93, np.datetime64('2020-10-11T17:10:00'), np.timedelta64(10466, 's'), 'P', 'B', 'tabular',
            # The embedded newline and the run of spaces after it are part of the value.
            'If your workflow is fast enough on a single GPU or your data comfortably fits in memory on \n'
            '                 a single GPU, you would want to use cuDF.',
        ),
        (
            33, 9.26, np.datetime64('2020-10-15T10:58:02'), np.timedelta64(35558, 's'), 'O', 'D', 'parallel',
            # The embedded newline and the run of spaces after it are part of the value.
            'If you want to distribute your workflow across multiple GPUs or have more data than you can fit \n'
            '                 in memory on a single GPU you would want to use Dask-cuDF',
        ),
        (
            42, 4.21, np.datetime64('2020-10-01T10:02:23'), np.timedelta64(20480, 's'), 'U', 'C', 'GPUs',
            'BlazingSQL provides a high-performance distributed SQL engine in Python',
        ),
        (
            36, 3.01, np.datetime64('2020-09-30T14:36:26'), np.timedelta64(24409, 's'), 'T', 'D', None,
            'BlazingSQL is built on the RAPIDS GPU data science ecosystem',
        ),
        (
            38, 6.44, np.datetime64('2020-10-10T08:34:36'), np.timedelta64(90171, 's'), 'X', 'B', 'csv',
            'BlazingSQL lets you ETL raw data directly into GPU memory as a GPU DataFrame (GDF)',
        ),
        (
            17, 5.28, np.datetime64('2020-10-09T08:34:40'), np.timedelta64(30532, 's'), 'P', 'D', 'dataframes',
            'Dask is a flexible library for parallel computing in Python',
        ),
        (
            10, 8.28, np.datetime64('2020-10-03T03:31:21'), np.timedelta64(23552, 's'), 'W', 'B', 'python',
            None,
        ),
    ],
    columns=['num', 'float', 'datetime', 'timedelta', 'char', 'category', 'word', 'string'],
)
# Convert the 'category' column from plain strings to the categorical dtype.
cudf_df['category'] = cudf_df['category'].astype('category')

---

# Querying

---

## <span style="color:blue">DataFrame</span>

#### cudf.core.dataframe.DataFrame.head()

In [4]:
# pandas: preview the top of the frame; with no argument, the first five rows are displayed below.
pandas_df.head()

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...
1,11,4.21,NaT,NaT,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...
2,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,D,memory,"cuDF allows for loading, joining, aggregating,..."
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...


In [5]:
# cudf: preview the top of the frame; with no argument, the first five rows are displayed below.
cudf_df.head()

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...
1,11,4.21,,,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...
2,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,D,memory,"cuDF allows for loading, joining, aggregating,..."
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...


In [6]:
# pandas: limit the preview to the first two rows.
pandas_df.head(n=2)

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...
1,11,4.21,NaT,NaT,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...


In [7]:
# cudf: limit the preview to the first two rows.
cudf_df.head(n=2)

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...
1,11,4.21,,,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...


#### cudf.core.dataframe.DataFrame.info()

In [8]:
# pandas: print a textual summary of the frame -- index range, per-column
# non-null counts and dtypes, and the total memory footprint.
pandas_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype          
---  ------     --------------  -----          
 0   num        10 non-null     int64          
 1   float      10 non-null     float64        
 2   datetime   9 non-null      datetime64[ns] 
 3   timedelta  9 non-null      timedelta64[ns]
 4   char       10 non-null     object         
 5   category   10 non-null     category       
 6   word       9 non-null      object         
 7   string     9 non-null      object         
dtypes: category(1), datetime64[ns](1), float64(1), int64(1), object(3), timedelta64[ns](1)
memory usage: 802.0+ bytes


In [9]:
# cudf: print a textual summary of the frame -- index range, per-column
# non-null counts and dtypes, and the total memory footprint.
# In the captured output, the datetime/timedelta columns are reported with
# second resolution ([s]) here, versus nanosecond ([ns]) in the pandas summary.
cudf_df.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   num        10 non-null     int64
 1   float      10 non-null     float64
 2   datetime   9 non-null      datetime64[s]
 3   timedelta  9 non-null      timedelta64[s]
 4   char       10 non-null     object
 5   category   10 non-null     category
 6   word       9 non-null      object
 7   string     9 non-null      object
dtypes: category(1), datetime64[s](1), float64(1), int64(1), object(3), timedelta64[s](1)
memory usage: 1.7+ KB


#### cudf.core.dataframe.DataFrame.memory_usage()

In [10]:
# pandas: per-column memory footprint in bytes; the Index entry is listed first.
pandas_df.memory_usage()

Index        128
num           80
float         80
datetime      80
timedelta     80
char          80
category     114
word          80
string        80
dtype: int64

In [11]:
# cudf: per-column memory footprint in bytes; in the captured output the Index
# entry is listed last, and the figures differ from the pandas ones.
cudf_df.memory_usage()

num            80
float          80
datetime      144
timedelta     144
char           54
category       29
word          160
string       1037
Index           0
dtype: int64

#### cudf.core.dataframe.DataFrame.nlargest()

In [12]:
# pandas: the three rows with the highest 'num' values, largest first.
pandas_df.nlargest(n=3, columns=['num'])

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...
0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...


In [13]:
# cudf: the three rows with the highest 'num' values, largest first.
cudf_df.nlargest(n=3, columns=['num'])

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...
0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...


#### cudf.core.dataframe.DataFrame.nsmallest()

In [14]:
# pandas: the three rows with the lowest 'num' values, smallest first.
pandas_df.nsmallest(n=3, columns=['num'])

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
9,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,B,python,
1,11,4.21,NaT,NaT,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...
8,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,D,dataframes,Dask is a flexible library for parallel comput...


In [15]:
# cudf: the three rows with the lowest 'num' values, smallest first.
cudf_df.nsmallest(n=3, columns=['num'])

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
9,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,B,python,
1,11,4.21,,,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...
8,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,D,dataframes,Dask is a flexible library for parallel comput...


#### cudf.core.dataframe.DataFrame.query()

In [16]:
# pandas: filter rows with a string expression -- exact match on 'num'.
pandas_df.query(expr='num == 10')

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
9,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,B,python,


In [17]:
# cudf: filter rows with a string expression -- exact match on 'num'.
cudf_df.query(expr='num == 10')

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
9,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,B,python,


In [18]:
# pandas: filter rows with a string expression -- 'num' at or above a threshold.
pandas_df.query(expr='num >= 40')

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...
5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...


In [19]:
# cudf: filter rows with a string expression -- 'num' at or above a threshold.
cudf_df.query(expr='num >= 40')

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...
5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...


#### cudf.core.dataframe.DataFrame.sample()

In [20]:
# pandas: draw one row at random.
# NOTE: no seed is passed, so the row shown below will generally change on
# every run; pass random_state=<int> if a reproducible draw is needed.
pandas_df.sample()

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...


In [21]:
# cudf: draw one row at random.
# NOTE: no seed is passed, so the row shown below will generally change on
# every run; pass random_state=<int> if a reproducible draw is needed.
cudf_df.sample()

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
1,11,4.21,,,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...


In [22]:
# pandas: draw three rows at random (unseeded, so the selection varies between runs).
pandas_df.sample(n=3)

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...
9,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,B,python,


In [23]:
# cudf: draw three rows at random (unseeded, so the selection varies between runs).
cudf_df.sample(n=3)

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
8,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,D,dataframes,Dask is a flexible library for parallel comput...
9,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,B,python,
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...
