# cuDF Cheat Sheets sample code

(c) 2020 NVIDIA, BlazingSQL

Distributed under Apache License 2.0

### Imports

In [1]:
import cudf
import numpy as np

### Sample DataFrame

In [2]:
# Sample frame used by every cell below: ten rows, eight columns.
# None marks a missing value (one each in 'datetime', 'timedelta', 'word' and 'string').
# Two of the long descriptions contain a newline followed by 17 spaces; it is
# written out explicitly here so that the embedded whitespace is visible.
df = cudf.DataFrame(
    [
        (
            39, 6.88, np.datetime64('2020-10-08T12:12:01'), np.timedelta64(14378, 's'), 'C', 'D', 'data',
            'RAPIDS.ai is a suite of open-source libraries that allow you to run your end to end data science and analytics pipelines on GPUs.',
        ),
        (
            11, 4.21, None, None, 'A', 'D', 'cuDF',
            'cuDF is a Python GPU DataFrame (built on the Apache Arrow columnar memory format)',
        ),
        (
            31, 4.71, np.datetime64('2020-10-10T09:26:43'), np.timedelta64(12909, 's'), 'U', 'D', 'memory',
            'cuDF allows for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame style API.',
        ),
        (
            40, 0.93, np.datetime64('2020-10-11T17:10:00'), np.timedelta64(10466, 's'), 'P', 'B', 'tabular',
            'If your workflow is fast enough on a single GPU or your data comfortably fits in memory on \n'
            '                 a single GPU, you would want to use cuDF.',
        ),
        (
            33, 9.26, np.datetime64('2020-10-15T10:58:02'), np.timedelta64(35558, 's'), 'O', 'D', 'parallel',
            'If you want to distribute your workflow across multiple GPUs or have more data than you can fit \n'
            '                 in memory on a single GPU you would want to use Dask-cuDF',
        ),
        (
            42, 4.21, np.datetime64('2020-10-01T10:02:23'), np.timedelta64(20480, 's'), 'U', 'C', 'GPUs',
            'BlazingSQL provides a high-performance distributed SQL engine in Python',
        ),
        (
            36, 3.01, np.datetime64('2020-09-30T14:36:26'), np.timedelta64(24409, 's'), 'T', 'D', None,
            'BlazingSQL is built on the RAPIDS GPU data science ecosystem',
        ),
        (
            38, 6.44, np.datetime64('2020-10-10T08:34:36'), np.timedelta64(90171, 's'), 'X', 'B', 'csv',
            'BlazingSQL lets you ETL raw data directly into GPU memory as a GPU DataFrame (GDF)',
        ),
        (
            17, 5.28, np.datetime64('2020-10-09T08:34:40'), np.timedelta64(30532, 's'), 'P', 'D', 'dataframes',
            'Dask is a flexible library for parallel computing in Python',
        ),
        (
            10, 8.28, np.datetime64('2020-10-03T03:31:21'), np.timedelta64(23552, 's'), 'W', 'B', 'python',
            None,
        ),
    ],
    columns=['num', 'float', 'datetime', 'timedelta', 'char', 'category', 'word', 'string'],
)
# Convert the 'category' column to the categorical dtype after construction.
df['category'] = df['category'].astype('category')

---

# Properties

---

## <span style="color:blue">DataFrame</span>

#### cudf.core.dataframe.DataFrame.at()

In [3]:
# Select the row labelled 3; the result is displayed as a one-row DataFrame.
# NOTE(review): pandas restricts `.at` to scalar (row, column) lookups; this row form
# appears to rely on cuDF treating `.at` like `.loc` - confirm for the cuDF version in use.
df.at[3] 

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...


In [4]:
# Label-based slice: both endpoints are included, so rows 3 through 7 are returned.
df.at[3:7]

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...
5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...
6,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...
7,38,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,B,csv,BlazingSQL lets you ETL raw data directly into...


In [5]:
# Scalar lookup by row label and column name; returns the full Python string.
df.at[2, 'string']

'cuDF allows for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame style API.'

In [6]:
# Label slice on a single column: a Series covering rows 2 through 5 inclusive.
df.at[2:5, 'string']

2    cuDF allows for loading, joining, aggregating,...
3    If your workflow is fast enough on a single GP...
4    If you want to distribute your workflow across...
5    BlazingSQL provides a high-performance distrib...
Name: string, dtype: object

In [7]:
# A list of column names yields a DataFrame with the columns in the requested order.
df.at[2:5, ['string', 'float']]

Unnamed: 0,string,float
2,"cuDF allows for loading, joining, aggregating,...",4.71
3,If your workflow is fast enough on a single GP...,0.93
4,If you want to distribute your workflow across...,9.26
5,BlazingSQL provides a high-performance distrib...,4.21


#### cudf.core.dataframe.DataFrame.columns()

In [8]:
# Column labels, in the order they were passed to the constructor.
df.columns

Index(['num', 'float', 'datetime', 'timedelta', 'char', 'category', 'word',
       'string'],
      dtype='object')

#### cudf.core.dataframe.DataFrame.dtypes()

In [9]:
# Per-column dtypes; 'category' reflects the astype('category') conversion made after construction.
df.dtypes

num                   int64
float               float64
datetime      datetime64[s]
timedelta    timedelta64[s]
char                 object
category           category
word                 object
string               object
dtype: object

#### cudf.core.dataframe.DataFrame.iat()

In [10]:
# Select the row at position 3; the result is displayed as a one-row DataFrame.
# NOTE(review): pandas restricts `.iat` to scalar (row, column) lookups; this row form
# appears to rely on cuDF treating `.iat` like `.iloc` - confirm for the cuDF version in use.
df.iat[3]

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...


In [11]:
# Positional slice: the stop position is excluded, so rows 3 through 6 are returned.
df.iat[3:7]

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...
5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...
6,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...


In [12]:
# Scalar lookup by row position 2 and column position 7 (the 'string' column).
df.iat[2, 7]

'cuDF allows for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame style API.'

In [13]:
# Rows at positions 2-4 of column position 7, returned as a Series named 'string'.
df.iat[2:5, 7]

2    cuDF allows for loading, joining, aggregating,...
3    If your workflow is fast enough on a single GP...
4    If you want to distribute your workflow across...
Name: string, dtype: object

In [14]:
# Slices on both axes: rows 2-4 and column positions 6-7 ('word' and 'string').
df.iat[2:5, 6:8]

Unnamed: 0,word,string
2,memory,"cuDF allows for loading, joining, aggregating,..."
3,tabular,If your workflow is fast enough on a single GP...
4,parallel,If you want to distribute your workflow across...


In [15]:
# Column positions given as a list: 1, 3 and 5 pick 'float', 'timedelta' and 'category'.
df.iat[2:5, [1,3,5]]

Unnamed: 0,float,timedelta,category
2,4.71,0 days 03:35:09,D
3,0.93,0 days 02:54:26,B
4,9.26,0 days 09:52:38,D


#### cudf.core.dataframe.DataFrame.iloc()

In [16]:
# Select the row at position 3; the result is displayed as a one-row DataFrame.
df.iloc[3]

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...


In [17]:
# Positional slice with an exclusive stop: rows 3 and 4 only.
df.iloc[3:5]

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...


In [18]:
# Scalar lookup by row position 2 and column position 7 (the 'string' column).
df.iloc[2, 7]

'cuDF allows for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame style API.'

In [19]:
# Rows at positions 2-4 of column position 7, returned as a Series named 'string'.
df.iloc[2:5, 7]

2    cuDF allows for loading, joining, aggregating,...
3    If your workflow is fast enough on a single GP...
4    If you want to distribute your workflow across...
Name: string, dtype: object

In [20]:
# Row slice combined with a list of column positions: 4, 5 and 7 pick 'char', 'category' and 'string'.
df.iloc[2:5, [4,5,7]]

Unnamed: 0,char,category,string
2,U,D,"cuDF allows for loading, joining, aggregating,..."
3,P,B,If your workflow is fast enough on a single GP...
4,O,D,If you want to distribute your workflow across...


In [21]:
# Lists on both axes: rows at positions 1, 2 and 7, columns 'char', 'category' and 'word'.
df.iloc[[1,2,7], [4,5,6]]

Unnamed: 0,char,category,word
1,A,D,cuDF
2,U,D,memory
7,X,B,csv


#### cudf.core.dataframe.DataFrame.index()

In [22]:
# No index was supplied at construction, so the frame carries a RangeIndex over its 10 rows.
df.index

RangeIndex(start=0, stop=10, step=1)

#### cudf.core.dataframe.DataFrame.loc()

In [23]:
# Select the row labelled 3; the result is displayed as a one-row DataFrame.
df.loc[3]

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...


In [24]:
# Label-based slice: both endpoints are included, so rows 3 through 6 are returned.
df.loc[3:6]

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...
5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...
6,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...


In [25]:
# Scalar lookup by row label and column name; returns the full Python string.
df.loc[2, 'string']

'cuDF allows for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame style API.'

In [26]:
# Inclusive label slice (rows 3-6) restricted to two columns, kept in the requested order.
df.loc[3:6, ['string', 'float']]

Unnamed: 0,string,float
3,If your workflow is fast enough on a single GP...,0.93
4,If you want to distribute your workflow across...,9.26
5,BlazingSQL provides a high-performance distrib...,4.21
6,BlazingSQL is built on the RAPIDS GPU data sci...,3.01


#### cudf.core.dataframe.DataFrame.ndim()

In [27]:
# Number of dimensions of the frame (rows and columns).
df.ndim

2

#### cudf.core.dataframe.DataFrame.shape()

In [28]:
# (rows, columns) of the sample frame: 10 rows by 8 columns.
df.shape

(10, 8)

#### cudf.core.dataframe.DataFrame.size()

In [29]:
# Total number of cells: 10 rows x 8 columns = 80.
df.size

80

#### cudf.core.dataframe.DataFrame.T()

In [30]:
# Transpose a single-column frame: 'num' becomes the only row and the row labels 0-9 become the columns.
df[['num']].T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
num,39,11,31,40,33,42,36,38,17,10


#### cudf.core.dataframe.DataFrame.values()

In [31]:
# Both columns come back as one 2-D array; note that the integer 'num' values are shown
# with a decimal point, i.e. they share a floating-point dtype with 'float'.
df[['num', 'float']].values

array([[39.  ,  6.88],
       [11.  ,  4.21],
       [31.  ,  4.71],
       [40.  ,  0.93],
       [33.  ,  9.26],
       [42.  ,  4.21],
       [36.  ,  3.01],
       [38.  ,  6.44],
       [17.  ,  5.28],
       [10.  ,  8.28]])

## <span style="color:blue">Series</span>

#### cudf.core.series.Series.cat()

In [32]:
# Returns the categorical accessor object for the column; only its repr is shown here.
df['category'].cat

<cudf.core.column.categorical.CategoricalAccessor at 0x7f19f82e7040>

#### cudf.core.series.Series.data()

In [33]:
# Returns the Buffer object behind the column; only its repr is shown here.
df['num'].data

<cudf.core.buffer.Buffer at 0x7f19f82e7130>

#### cudf.core.series.Series.dt()

In [34]:
# On a datetime column, `.dt` returns a DatetimeProperties accessor.
df['datetime'].dt

<cudf.core.series.DatetimeProperties at 0x7f19f9507520>

In [35]:
# On a timedelta column, the same `.dt` attribute returns a TimedeltaProperties accessor instead.
df['timedelta'].dt

<cudf.core.series.TimedeltaProperties at 0x7f19f9507880>

#### cudf.core.series.Series.dtype()

In [36]:
# dtype of a single column; the integer column is stored as int64.
df['num'].dtype

dtype('int64')

#### cudf.core.series.Series.has_nulls()

In [37]:
# False: every row of the sample data supplies a value for 'num'.
df['num'].has_nulls

False

In [38]:
# True: the last row of the sample data has None in 'string'.
df['string'].has_nulls

True

#### cudf.core.series.Series.iloc()

In [39]:
# Scalar lookup by position on a Series: the second element.
df['num'].iloc[1]

11

In [40]:
# Positional slice with an exclusive stop: elements at positions 1, 2 and 3.
df['num'].iloc[1:4]

1    11
2    31
3    40
Name: num, dtype: int64

#### cudf.core.series.Series.index()

In [41]:
# A column taken from the frame shows the same RangeIndex as the frame itself.
df['num'].index

RangeIndex(start=0, stop=10, step=1)

#### cudf.core.series.Series.is_monotonic_decreasing()

In [42]:
# False: the 'num' values (39, 11, 31, 40, ...) are not sorted in descending order.
df['num'].is_monotonic_decreasing

False

#### cudf.core.series.Series.is_monotonic_increasing()

In [43]:
# Check the increasing direction, as the section heading states.
# The 'num' values (39, 11, 31, 40, ...) are not sorted in ascending order, so this is False.
df['num'].is_monotonic_increasing

False

#### cudf.core.series.Series.is_monotonic()

In [44]:
# False for the unsorted 'num' column.
# NOTE(review): pandas has deprecated `is_monotonic` in favour of `is_monotonic_increasing`;
# check whether the targeted cuDF release still provides it.
df['num'].is_monotonic

False

#### cudf.core.series.Series.is_unique()

In [45]:
# True: all ten 'num' values in the sample data are distinct.
df['num'].is_unique

True

#### cudf.core.series.Series.loc()

In [46]:
# Scalar lookup by index label on a Series.
df['num'].loc[3]

40

In [47]:
# Label-based slice: both endpoints are included, so labels 3 through 6 are returned.
df['num'].loc[3:6]

3    40
4    33
5    42
6    36
Name: num, dtype: int64

#### cudf.core.series.Series.name()

In [48]:
# A column selected from the frame keeps its column label as the Series name.
df['float'].name

'float'

#### cudf.core.series.Series.ndim()

In [49]:
# A Series has a single dimension.
df['float'].ndim

1

#### cudf.core.series.Series.null_count()

In [50]:
# 0: every row of the sample data supplies a value for 'float'.
df['float'].null_count

0

In [51]:
# 1: only the last row of the sample data has None in 'string'.
df['string'].null_count

1

#### cudf.core.series.Series.nullable()

In [52]:
# Reports False for 'num', a column built without any None entries.
df['num'].nullable

False

In [53]:
# Reports True for 'string', a column that contains a None entry.
df['string'].nullable

True

#### cudf.core.series.Series.nullmask()

In [54]:
# The mask is displayed as a Series of int32 words. The first word, 1021 (0b1111111101),
# has bit 1 cleared, matching the None supplied for 'datetime' in row 1.
df['datetime'].nullmask

0     1021
1        0
2        0
3        0
4        0
5        0
6        0
7        0
8        0
9        0
10       0
11       0
12       0
13       0
14       0
15       0
dtype: int32

In [55]:
# The mask is displayed as a Series of int32 words. The first word, 959 (0b1110111111),
# has bit 6 cleared, matching the None supplied for 'word' in row 6.
df['word'].nullmask

0     959
1       0
2       0
3       0
4       0
5       0
6       0
7       0
8       0
9       0
10      0
11      0
12      0
13      0
14      0
15      0
dtype: int32

#### cudf.core.series.Series.shape()

In [56]:
# Shape of a Series is a one-element tuple holding its length.
df['num'].shape

(10,)

#### cudf.core.series.Series.size()

In [57]:
# Number of elements in the column.
df['float'].size

10

In [58]:
# Still 10 although 'word' contains one None: the missing entry is included in the size.
df['word'].size

10

#### cudf.core.series.Series.str()

In [59]:
# Returns the StringMethods accessor for the column; only its repr is shown here.
df['word'].str

<cudf.core.column.string.StringMethods at 0x7f19f8265280>

#### cudf.core.series.Series.valid_count()

In [60]:
# 10: every row of the sample data supplies a value for 'float'.
df['float'].valid_count

10

In [61]:
# 9: one of the ten 'word' entries is None and is not counted.
df['word'].valid_count

9

#### cudf.core.series.Series.values()

In [62]:
# All ten 'num' values as an array, in row order.
df['num'].values

array([39, 11, 31, 40, 33, 42, 36, 38, 17, 10])