# cuDF Cheat Sheets sample code

(c) 2020 NVIDIA, Blazing SQL

Distributed under Apache License 2.0

### Imports

In [1]:
import cudf
import numpy as np
import pandas as pd

### Sample DataFrame

In [2]:
# pandas -- host-side sample frame shared by the examples below.
# Each tuple is one row; values follow the order of the `columns` list.
pandas_df = pd.DataFrame(
    [
        (
            39, 6.88, np.datetime64('2020-10-08T12:12:01'), np.timedelta64(14378, 's'), 'C', 'D', 'data',
            'RAPIDS.ai is a suite of open-source libraries that allow you to run your end to end data science and analytics pipelines on GPUs.',
        ),
        # Row 1 has no datetime and no timedelta.
        (
            11, 4.21, None, None, 'A', 'D', 'cuDF',
            'cuDF is a Python GPU DataFrame (built on the Apache Arrow columnar memory format)',
        ),
        (
            31, 4.71, np.datetime64('2020-10-10T09:26:43'), np.timedelta64(12909, 's'), 'U', 'D', 'memory',
            'cuDF allows for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame style API.',
        ),
        # The next two rows use triple-quoted literals that span two source lines, so the
        # stored text contains the line break and the continuation line's leading spaces.
        (
            40, 0.93, np.datetime64('2020-10-11T17:10:00'), np.timedelta64(10466, 's'), 'P', 'B', 'tabular',
            '''If your workflow is fast enough on a single GPU or your data comfortably fits in memory on 
                 a single GPU, you would want to use cuDF.''',
        ),
        (
            33, 9.26, np.datetime64('2020-10-15T10:58:02'), np.timedelta64(35558, 's'), 'O', 'D', 'parallel',
            '''If you want to distribute your workflow across multiple GPUs or have more data than you can fit 
                 in memory on a single GPU you would want to use Dask-cuDF''',
        ),
        (
            42, 4.21, np.datetime64('2020-10-01T10:02:23'), np.timedelta64(20480, 's'), 'U', 'C', 'GPUs',
            'BlazingSQL provides a high-performance distributed SQL engine in Python',
        ),
        # Row 6 has no 'word'.
        (
            36, 3.01, np.datetime64('2020-09-30T14:36:26'), np.timedelta64(24409, 's'), 'T', 'D', None,
            'BlazingSQL is built on the RAPIDS GPU data science ecosystem',
        ),
        (
            38, 6.44, np.datetime64('2020-10-10T08:34:36'), np.timedelta64(90171, 's'), 'X', 'B', 'csv',
            'BlazingSQL lets you ETL raw data directly into GPU memory as a GPU DataFrame (GDF)',
        ),
        (
            17, 5.28, np.datetime64('2020-10-09T08:34:40'), np.timedelta64(30532, 's'), 'P', 'D', 'dataframes',
            'Dask is a flexible library for parallel computing in Python',
        ),
        # Row 9 has no 'string'.
        (
            10, 8.28, np.datetime64('2020-10-03T03:31:21'), np.timedelta64(23552, 's'), 'W', 'B', 'python',
            None,
        ),
    ],
    columns=['num', 'float', 'datetime', 'timedelta', 'char', 'category', 'word', 'string'],
)

# The 'category' column is built from plain strings; convert it to the categorical dtype.
pandas_df['category'] = pandas_df['category'].astype('category')

In [3]:
# cudf -- GPU sample frame built from the same rows and columns as the pandas cell.
# Each tuple is one row; values follow the order of the `columns` list.
cudf_df = cudf.DataFrame(
    [
        (
            39, 6.88, np.datetime64('2020-10-08T12:12:01'), np.timedelta64(14378, 's'), 'C', 'D', 'data',
            'RAPIDS.ai is a suite of open-source libraries that allow you to run your end to end data science and analytics pipelines on GPUs.',
        ),
        # Row 1 has no datetime and no timedelta.
        (
            11, 4.21, None, None, 'A', 'D', 'cuDF',
            'cuDF is a Python GPU DataFrame (built on the Apache Arrow columnar memory format)',
        ),
        (
            31, 4.71, np.datetime64('2020-10-10T09:26:43'), np.timedelta64(12909, 's'), 'U', 'D', 'memory',
            'cuDF allows for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame style API.',
        ),
        # The next two rows use triple-quoted literals that span two source lines, so the
        # stored text contains the line break and the continuation line's leading spaces.
        (
            40, 0.93, np.datetime64('2020-10-11T17:10:00'), np.timedelta64(10466, 's'), 'P', 'B', 'tabular',
            '''If your workflow is fast enough on a single GPU or your data comfortably fits in memory on 
                 a single GPU, you would want to use cuDF.''',
        ),
        (
            33, 9.26, np.datetime64('2020-10-15T10:58:02'), np.timedelta64(35558, 's'), 'O', 'D', 'parallel',
            '''If you want to distribute your workflow across multiple GPUs or have more data than you can fit 
                 in memory on a single GPU you would want to use Dask-cuDF''',
        ),
        (
            42, 4.21, np.datetime64('2020-10-01T10:02:23'), np.timedelta64(20480, 's'), 'U', 'C', 'GPUs',
            'BlazingSQL provides a high-performance distributed SQL engine in Python',
        ),
        # Row 6 has no 'word'.
        (
            36, 3.01, np.datetime64('2020-09-30T14:36:26'), np.timedelta64(24409, 's'), 'T', 'D', None,
            'BlazingSQL is built on the RAPIDS GPU data science ecosystem',
        ),
        (
            38, 6.44, np.datetime64('2020-10-10T08:34:36'), np.timedelta64(90171, 's'), 'X', 'B', 'csv',
            'BlazingSQL lets you ETL raw data directly into GPU memory as a GPU DataFrame (GDF)',
        ),
        (
            17, 5.28, np.datetime64('2020-10-09T08:34:40'), np.timedelta64(30532, 's'), 'P', 'D', 'dataframes',
            'Dask is a flexible library for parallel computing in Python',
        ),
        # Row 9 has no 'string'.
        (
            10, 8.28, np.datetime64('2020-10-03T03:31:21'), np.timedelta64(23552, 's'), 'W', 'B', 'python',
            None,
        ),
    ],
    columns=['num', 'float', 'datetime', 'timedelta', 'char', 'category', 'word', 'string'],
)

# The 'category' column is built from plain strings; convert it to the categorical dtype.
cudf_df['category'] = cudf_df['category'].astype('category')

---

# Properties

---

## <span style="color:blue">DataFrame</span>

#### cudf.core.dataframe.DataFrame.at

In [4]:
# pandas: .at reads a single cell by row label (2) and column label ('string')
pandas_df.at[2, 'string']

'cuDF allows for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame style API.'

In [5]:
# cudf: .at reads a single cell by row label (2) and column label ('string')
cudf_df.at[2, 'string']

'cuDF allows for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame style API.'

#### cudf.core.dataframe.DataFrame.columns

In [6]:
# pandas: .columns is an attribute (no call) holding the column labels as an Index
pandas_df.columns

Index(['num', 'float', 'datetime', 'timedelta', 'char', 'category', 'word',
       'string'],
      dtype='object')

In [7]:
# cudf: .columns is an attribute (no call) holding the column labels as an Index
cudf_df.columns

Index(['num', 'float', 'datetime', 'timedelta', 'char', 'category', 'word',
       'string'],
      dtype='object')

#### cudf.core.dataframe.DataFrame.dtypes

In [8]:
# pandas: .dtypes lists the dtype of every column, indexed by column name
pandas_df.dtypes

num                    int64
float                float64
datetime      datetime64[ns]
timedelta    timedelta64[ns]
char                  object
category            category
word                  object
string                object
dtype: object

In [9]:
# cudf: .dtypes lists the dtype of every column, indexed by column name.
# In the recorded outputs the temporal columns are second-resolution here
# (datetime64[s] / timedelta64[s]) while the pandas frame reports [ns].
cudf_df.dtypes

num                   int64
float               float64
datetime      datetime64[s]
timedelta    timedelta64[s]
char                 object
category           category
word                 object
string               object
dtype: object

#### cudf.core.dataframe.DataFrame.iat

In [10]:
# pandas: .iat reads a single cell by integer position -- row 2, column 7 ('string', the last column)
pandas_df.iat[2, 7]

'cuDF allows for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame style API.'

In [11]:
# cudf: .iat reads a single cell by integer position -- row 2, column 7 ('string', the last column)
cudf_df.iat[2, 7]

'cuDF allows for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame style API.'

#### cudf.core.dataframe.DataFrame.iloc

In [12]:
# pandas: a single integer selects the row at that position; the result prints as a Series named 3
pandas_df.iloc[3]

num                                                         40
float                                                     0.93
datetime                                   2020-10-11 17:10:00
timedelta                                      0 days 02:54:26
char                                                         P
category                                                     B
word                                                   tabular
string       If your workflow is fast enough on a single GP...
Name: 3, dtype: object

In [13]:
# cudf: a single integer selects the row at that position; the recorded output
# below is rendered as a one-row table rather than the Series pandas prints
cudf_df.iloc[3]

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...


In [14]:
# pandas: positional slice 3:5 returns the rows at positions 3 and 4 (the stop is exclusive)
pandas_df.iloc[3:5]

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...


In [15]:
# pandas: (row position, column position) returns one cell -- row 2, column 7 ('string')
pandas_df.iloc[2, 7]

'cuDF allows for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame style API.'

In [16]:
# cudf: (row position, column position) returns one cell -- row 2, column 7 ('string')
cudf_df.iloc[2, 7]

'cuDF allows for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame style API.'

In [17]:
# pandas: row slice plus one column position returns that column for rows 2..4 as a Series
pandas_df.iloc[2:5, 7]

2    cuDF allows for loading, joining, aggregating,...
3    If your workflow is fast enough on a single GP...
4    If you want to distribute your workflow across...
Name: string, dtype: object

In [18]:
# cudf: row slice plus one column position returns that column for rows 2..4 as a Series
cudf_df.iloc[2:5, 7]

2    cuDF allows for loading, joining, aggregating,...
3    If your workflow is fast enough on a single GP...
4    If you want to distribute your workflow across...
Name: string, dtype: object

In [19]:
# pandas: row slice plus a list of column positions (4, 5, 7 -> 'char', 'category', 'string')
pandas_df.iloc[2:5, [4,5,7]]

Unnamed: 0,char,category,string
2,U,D,"cuDF allows for loading, joining, aggregating,..."
3,P,B,If your workflow is fast enough on a single GP...
4,O,D,If you want to distribute your workflow across...


In [20]:
# cudf: row slice plus a list of column positions (4, 5, 7 -> 'char', 'category', 'string')
cudf_df.iloc[2:5, [4,5,7]]

Unnamed: 0,char,category,string
2,U,D,"cuDF allows for loading, joining, aggregating,..."
3,P,B,If your workflow is fast enough on a single GP...
4,O,D,If you want to distribute your workflow across...


In [21]:
# pandas: lists on both axes pick non-contiguous rows (1, 2, 7) and columns (4, 5, 6 -> 'char', 'category', 'word')
pandas_df.iloc[[1,2,7], [4,5,6]]

Unnamed: 0,char,category,word
1,A,D,cuDF
2,U,D,memory
7,X,B,csv


In [22]:
# cudf: lists on both axes pick non-contiguous rows (1, 2, 7) and columns (4, 5, 6 -> 'char', 'category', 'word')
cudf_df.iloc[[1,2,7], [4,5,6]]

Unnamed: 0,char,category,word
1,A,D,cuDF
2,U,D,memory
7,X,B,csv


#### cudf.core.dataframe.DataFrame.index

In [23]:
# pandas: .index holds the row labels; no index was passed at construction, so it is a RangeIndex over the 10 rows
pandas_df.index

RangeIndex(start=0, stop=10, step=1)

In [24]:
# cudf: .index holds the row labels; no index was passed at construction, so it is a RangeIndex over the 10 rows
cudf_df.index

RangeIndex(start=0, stop=10, step=1)

#### cudf.core.dataframe.DataFrame.loc

In [25]:
# pandas: a single label selects the row with that index label; the result prints as a Series named 3
pandas_df.loc[3]

num                                                         40
float                                                     0.93
datetime                                   2020-10-11 17:10:00
timedelta                                      0 days 02:54:26
char                                                         P
category                                                     B
word                                                   tabular
string       If your workflow is fast enough on a single GP...
Name: 3, dtype: object

In [26]:
# cudf: a single label selects the row with that index label; the recorded output
# below is rendered as a one-row table rather than the Series pandas prints
cudf_df.loc[3]

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...


In [27]:
# pandas: label slice 3:6 includes both endpoints, so rows 3, 4, 5 and 6 are returned
pandas_df.loc[3:6]

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...
5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...
6,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...


In [28]:
# cudf: label slice 3:6 includes both endpoints, so rows 3, 4, 5 and 6 are returned
cudf_df.loc[3:6]

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...
5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...
6,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...


In [29]:
# pandas: (row label, column label) returns one cell
pandas_df.loc[2, 'string']

'cuDF allows for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame style API.'

In [30]:
# cudf: (row label, column label) returns one cell
cudf_df.loc[2, 'string']

'cuDF allows for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame style API.'

In [31]:
# pandas: label slice plus a list of column names; columns come back in the order listed ('string', then 'float')
pandas_df.loc[3:6, ['string', 'float']]

Unnamed: 0,string,float
3,If your workflow is fast enough on a single GP...,0.93
4,If you want to distribute your workflow across...,9.26
5,BlazingSQL provides a high-performance distrib...,4.21
6,BlazingSQL is built on the RAPIDS GPU data sci...,3.01


In [32]:
# cudf: label slice plus a list of column names; columns come back in the order listed ('string', then 'float')
cudf_df.loc[3:6, ['string', 'float']]

Unnamed: 0,string,float
3,If your workflow is fast enough on a single GP...,0.93
4,If you want to distribute your workflow across...,9.26
5,BlazingSQL provides a high-performance distrib...,4.21
6,BlazingSQL is built on the RAPIDS GPU data sci...,3.01


#### cudf.core.dataframe.DataFrame.ndim

In [33]:
# pandas: .ndim is the number of axes; a DataFrame has rows and columns, hence 2
pandas_df.ndim

2

In [34]:
# cudf: .ndim is the number of axes; a DataFrame has rows and columns, hence 2
cudf_df.ndim

2

#### cudf.core.dataframe.DataFrame.shape

In [35]:
# pandas: .shape is the tuple (number of rows, number of columns)
pandas_df.shape

(10, 8)

In [36]:
# cudf: .shape is the tuple (number of rows, number of columns)
cudf_df.shape

(10, 8)

#### cudf.core.dataframe.DataFrame.size

In [37]:
# pandas: .size is the total number of cells, rows x columns (10 * 8 = 80)
pandas_df.size

80

In [38]:
# cudf: .size is the total number of cells, rows x columns (10 * 8 = 80)
cudf_df.size

80

#### cudf.core.dataframe.DataFrame.T

In [39]:
# pandas: .T swaps rows and columns; [['num']] first keeps a one-column frame,
# so the transpose is a single 'num' row with ten columns labelled 0..9
pandas_df[['num']].T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
num,39,11,31,40,33,42,36,38,17,10


In [40]:
# cudf: .T swaps rows and columns; [['num']] first keeps a one-column frame,
# so the transpose is a single 'num' row with ten columns labelled 0..9.
# NOTE(review): presumably limited to one column because a cuDF transpose needs
# all columns to share a dtype -- confirm against the cuDF docs.
cudf_df[['num']].T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
num,39,11,31,40,33,42,36,38,17,10


#### cudf.core.dataframe.DataFrame.values

In [41]:
# pandas: .values returns the selected columns as one 2-D array (10 rows x 2 columns);
# the integer 'num' values are shown as floats because both columns share one array
pandas_df[['num', 'float']].values

array([[39.  ,  6.88],
       [11.  ,  4.21],
       [31.  ,  4.71],
       [40.  ,  0.93],
       [33.  ,  9.26],
       [42.  ,  4.21],
       [36.  ,  3.01],
       [38.  ,  6.44],
       [17.  ,  5.28],
       [10.  ,  8.28]])

In [42]:
# cudf: .values returns the selected columns as one 2-D array (10 rows x 2 columns);
# the integer 'num' values are shown as floats because both columns share one array.
# NOTE(review): presumably a device-side (CuPy) array rather than a NumPy one -- confirm.
cudf_df[['num', 'float']].values

array([[39.  ,  6.88],
       [11.  ,  4.21],
       [31.  ,  4.71],
       [40.  ,  0.93],
       [33.  ,  9.26],
       [42.  ,  4.21],
       [36.  ,  3.01],
       [38.  ,  6.44],
       [17.  ,  5.28],
       [10.  ,  8.28]])