# cuDF Cheat Sheets sample code

(c) 2020 NVIDIA, Blazing SQL

Distributed under Apache License 2.0

### Imports

In [1]:
import cudf
import numpy as np

### Sample DataFrame

In [2]:
# Build the ten-row sample frame used by every example below. Each tuple is
# one row and its fields follow the order of the `columns` list. The `None`
# entries become nulls, so the datetime, timedelta, word and string columns
# each contain a missing value.
# NOTE: the two triple-quoted strings span two source lines on purpose -- the
# trailing space, the line break and the leading indentation of the second
# line are all part of the stored value, so do not re-indent them.
df = cudf.DataFrame(
    data=[
        (
            39, 6.88, np.datetime64('2020-10-08T12:12:01'), np.timedelta64(14378, 's'), 'C', 'D', 'data',
            'RAPIDS.ai is a suite of open-source libraries that allow you to run your end to end data science and analytics pipelines on GPUs.',
        ),
        (
            11, 4.21, None, None, 'A', 'D', 'cuDF',
            'cuDF is a Python GPU DataFrame (built on the Apache Arrow columnar memory format)',
        ),
        (
            31, 4.71, np.datetime64('2020-10-10T09:26:43'), np.timedelta64(12909, 's'), 'U', 'D', 'memory',
            'cuDF allows for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame style API.',
        ),
        (
            40, 0.93, np.datetime64('2020-10-11T17:10:00'), np.timedelta64(10466, 's'), 'P', 'B', 'tabular',
            '''If your workflow is fast enough on a single GPU or your data comfortably fits in memory on 
                 a single GPU, you would want to use cuDF.''',
        ),
        (
            33, 9.26, np.datetime64('2020-10-15T10:58:02'), np.timedelta64(35558, 's'), 'O', 'D', 'parallel',
            '''If you want to distribute your workflow across multiple GPUs or have more data than you can fit 
                 in memory on a single GPU you would want to use Dask-cuDF''',
        ),
        (
            42, 4.21, np.datetime64('2020-10-01T10:02:23'), np.timedelta64(20480, 's'), 'U', 'C', 'GPUs',
            'BlazingSQL provides a high-performance distributed SQL engine in Python',
        ),
        (
            36, 3.01, np.datetime64('2020-09-30T14:36:26'), np.timedelta64(24409, 's'), 'T', 'D', None,
            'BlazingSQL is built on the RAPIDS GPU data science ecosystem',
        ),
        (
            38, 6.44, np.datetime64('2020-10-10T08:34:36'), np.timedelta64(90171, 's'), 'X', 'B', 'csv',
            'BlazingSQL lets you ETL raw data directly into GPU memory as a GPU DataFrame (GDF)',
        ),
        (
            17, 5.28, np.datetime64('2020-10-09T08:34:40'), np.timedelta64(30532, 's'), 'P', 'D', 'dataframes',
            'Dask is a flexible library for parallel computing in Python',
        ),
        (
            10, 8.28, np.datetime64('2020-10-03T03:31:21'), np.timedelta64(23552, 's'), 'W', 'B', 'python',
            None,
        ),
    ],
    columns=['num', 'float', 'datetime', 'timedelta', 'char', 'category', 'word', 'string'],
)

# Convert the 'category' column to a categorical dtype so the `.cat` accessor
# examples further down can operate on it.
df['category'] = df['category'].astype('category')

---

# Functions

---

## <span style="color:blue">String functions</span>

#### cudf.core.column.string.StringMethods.contains()

In [3]:
# Flag every row whose text contains 'GPU'; the null row stays null in the result.
df['string'].str.contains(pat='GPU')

0     True
1     True
2    False
3     True
4     True
5    False
6     True
7     True
8    False
9     <NA>
Name: string, dtype: bool

In [4]:
# Flag rows containing one or more literal dots. The pattern is written as a
# raw string so that the backslash reaches the regex engine untouched: in a
# normal string literal '\.' is an invalid escape sequence, which newer Python
# versions warn about.
df['string'].str.contains(r'\.+')

0     True
1    False
2     True
3     True
4    False
5    False
6    False
7    False
8    False
9     <NA>
Name: string, dtype: bool

In [5]:
# Regex search: one or more lowercase letters followed by 'flow' (e.g. 'workflow').
df['string'].str.contains(pat='[a-z]+flow')

0    False
1    False
2    False
3     True
4     True
5    False
6    False
7    False
8    False
9     <NA>
Name: string, dtype: bool

#### cudf.core.column.string.StringMethods.extract()

In [6]:
# Pull the text captured by the single group out of each row; rows without a
# match (and the null row) come back empty.
df['string'].str.extract(pat='(cuDF)')

Unnamed: 0,0
0,
1,cuDF
2,cuDF
3,cuDF
4,cuDF
5,
6,
7,
8,
9,


In [7]:
# Same extraction with a real regex: capture a lowercase word ending in 'flow'.
df['string'].str.extract(pat='([a-z]+flow)')

Unnamed: 0,0
0,
1,
2,
3,workflow
4,workflow
5,
6,
7,
8,
9,


#### cudf.core.column.string.StringMethods.findall()

In [8]:
# Collect the occurrences of 'cuDF' in each row.
df['string'].str.findall(pat='(cuDF)')

Unnamed: 0,0
0,
1,cuDF
2,cuDF
3,cuDF
4,cuDF
5,
6,
7,
8,
9,


In [9]:
# Collect every lowercase word ending in 'flow' found in each row.
df['string'].str.findall(pat='([a-z]+flow)')

Unnamed: 0,0
0,
1,
2,
3,workflow
4,workflow
5,
6,
7,
8,
9,


In [10]:
# 'GPU' can occur more than once per row, so this result has one column per
# occurrence (two columns for this data).
df['string'].str.findall(pat='(GPU)')

Unnamed: 0,0,1
0,GPU,
1,GPU,
2,,
3,GPU,GPU
4,GPU,GPU
5,,
6,GPU,
7,GPU,GPU
8,,
9,,


#### cudf.core.column.string.StringMethods.len()

In [11]:
# Length of each string; the null row yields a null length.
df['string'].str.len()

0     129
1      81
2     126
3     150
4     171
5      71
6      60
7      82
8      59
9    <NA>
Name: string, dtype: int32

#### cudf.core.column.string.StringMethods.lower()

In [12]:
# Lower-case every string; nulls are passed through unchanged.
df['string'].str.lower()

0    rapids.ai is a suite of open-source libraries ...
1    cudf is a python gpu dataframe (built on the a...
2    cudf allows for loading, joining, aggregating,...
3    if your workflow is fast enough on a single gp...
4    if you want to distribute your workflow across...
5    blazingsql provides a high-performance distrib...
6    blazingsql is built on the rapids gpu data sci...
7    blazingsql lets you etl raw data directly into...
8    dask is a flexible library for parallel comput...
9                                                 <NA>
Name: string, dtype: object

#### cudf.core.column.string.StringMethods.match()

In [13]:
# Test each word against the regex '(c)+'; in this data only the words that
# begin with 'c' ('cuDF', 'csv') match, and the null word stays null.
df['word'].str.match(pat='(c)+')

0    False
1     True
2    False
3    False
4    False
5    False
6     <NA>
7     True
8    False
9    False
Name: word, dtype: bool

#### cudf.core.column.string.StringMethods.ngrams_tokenize()

In [14]:
# Build 2-grams from adjacent tokens of each string, joined with '_'. The
# result is one flat Series of n-grams (148 entries here), not one entry per
# input row.
df['string'].str.ngrams_tokenize(n=2, separator='_')

0            RAPIDS.ai_is
1                    is_a
2                 a_suite
3                suite_of
4          of_open-source
              ...        
143           library_for
144          for_parallel
145    parallel_computing
146          computing_in
147             in_Python
Name: string, Length: 148, dtype: object

#### cudf.core.column.string.StringMethods.pad()

In [15]:
# Pad each word to a total width of 10 using the method's default side and
# fill character.
df['word'].str.pad(width=10)

0          data
1          cuDF
2        memory
3       tabular
4      parallel
5          GPUs
6          <NA>
7           csv
8    dataframes
9        python
Name: word, dtype: object

In [16]:
# Pad on the right with '#' up to width 10; words already 10 characters long
# ('dataframes') are left as they are.
df['word'].str.pad(width=10, side='right', fillchar='#')

0    data######
1    cuDF######
2    memory####
3    tabular###
4    parallel##
5    GPUs######
6          <NA>
7    csv#######
8    dataframes
9    python####
Name: word, dtype: object

In [17]:
# Pad on both sides with '-' so each word is centred in a field of width 10.
df['word'].str.pad(width=10, side='both', fillchar='-')

0    ---data---
1    ---cuDF---
2    --memory--
3    -tabular--
4    -parallel-
5    ---GPUs---
6          <NA>
7    ---csv----
8    dataframes
9    --python--
Name: word, dtype: object

#### cudf.core.column.string.StringMethods.replace()

In [18]:
# Regex replacement: 'da' followed by any two characters becomes 'tada'.
# `regex=True` is spelled out instead of relying on the default, because the
# default for this flag differs between library versions; being explicit keeps
# the result shown below stable and mirrors the `regex=False` example that
# follows.
df['word'].str.replace('da..', 'tada', regex=True)

0          tada
1          cuDF
2        memory
3       tabular
4      parallel
5          GPUs
6          <NA>
7           csv
8    tadaframes
9        python
Name: word, dtype: object

In [19]:
# Literal replacement: with regex disabled the text 'da..' must appear
# verbatim, which it never does here, so every word is returned unchanged.
df['word'].str.replace(pat='da..', repl='tada', regex=False)

0          data
1          cuDF
2        memory
3       tabular
4      parallel
5          GPUs
6          <NA>
7           csv
8    dataframes
9        python
Name: word, dtype: object

#### cudf.core.column.string.StringMethods.split()

In [20]:
# Split each string on single spaces into a list of tokens.
df['string'].str.split(pat=' ')

0    [RAPIDS.ai, is, a, suite, of, open-source, lib...
1    [cuDF, is, a, Python, GPU, DataFrame, (built, ...
2    [cuDF, allows, for, loading,, joining,, aggreg...
3    [If, your, workflow, is, fast, enough, on, a, ...
4    [If, you, want, to, distribute, your, workflow...
5    [BlazingSQL, provides, a, high-performance, di...
6    [BlazingSQL, is, built, on, the, RAPIDS, GPU, ...
7    [BlazingSQL, lets, you, ETL, raw, data, direct...
8    [Dask, is, a, flexible, library, for, parallel...
9                                                 None
Name: string, dtype: list

In [21]:
# Split on spaces at most 4 times; the remainder of the string stays together
# as the last list element.
df['string'].str.split(pat=' ', n=4)

0    [RAPIDS.ai, is, a, suite, of open-source libra...
1    [cuDF, is, a, Python, GPU DataFrame (built on ...
2    [cuDF, allows, for, loading,, joining, aggrega...
3    [If, your, workflow, is, fast enough on a sing...
4    [If, you, want, to, distribute your workflow a...
5    [BlazingSQL, provides, a, high-performance, di...
6    [BlazingSQL, is, built, on, the RAPIDS GPU dat...
7    [BlazingSQL, lets, you, ETL, raw data directly...
8    [Dask, is, a, flexible, library for parallel c...
9                                                 None
Name: string, dtype: list

#### cudf.core.column.string.StringMethods.subword_tokenize()

In [22]:
# Subword-tokenize the strings; the call is unpacked into three results.
# NOTE(review): 'hash.txt' is read from the working directory and is not
# created anywhere in this notebook -- presumably a pre-built vocabulary hash
# file; it must exist for this cell to run on a fresh kernel.
tokens, masks, metadata = df['string'].str.subword_tokenize('hash.txt')

In [23]:
# Same tokenization with explicit `max_length` and `stride` settings (both 10).
# Overwrites `tokens`, `masks` and `metadata` from the previous cell.
# NOTE(review): requires the external 'hash.txt' file, which this notebook does
# not create.
tokens, masks, metadata = df['string'].str.subword_tokenize('hash.txt', max_length=10, stride=10)

In [24]:
# Same tokenization with `do_lower=True` passed explicitly.
# Overwrites `tokens`, `masks` and `metadata` from the previous cell.
# NOTE(review): requires the external 'hash.txt' file, which this notebook does
# not create.
tokens, masks, metadata = df['string'].str.subword_tokenize('hash.txt', do_lower=True)

#### cudf.core.column.string.StringMethods.upper()

In [25]:
# Upper-case every string; nulls are passed through unchanged.
df['string'].str.upper()

0    RAPIDS.AI IS A SUITE OF OPEN-SOURCE LIBRARIES ...
1    CUDF IS A PYTHON GPU DATAFRAME (BUILT ON THE A...
2    CUDF ALLOWS FOR LOADING, JOINING, AGGREGATING,...
3    IF YOUR WORKFLOW IS FAST ENOUGH ON A SINGLE GP...
4    IF YOU WANT TO DISTRIBUTE YOUR WORKFLOW ACROSS...
5    BLAZINGSQL PROVIDES A HIGH-PERFORMANCE DISTRIB...
6    BLAZINGSQL IS BUILT ON THE RAPIDS GPU DATA SCI...
7    BLAZINGSQL LETS YOU ETL RAW DATA DIRECTLY INTO...
8    DASK IS A FLEXIBLE LIBRARY FOR PARALLEL COMPUT...
9                                                 <NA>
Name: string, dtype: object

## <span style="color:blue">Categorical functions</span>

#### cudf.core.column.categorical.CategoricalAccessor.add_categories()

In [26]:
# Return a copy of the column with 'A' and 'E' appended to its category list;
# the values themselves and `df` are left untouched.
df['category'].cat.add_categories(new_categories=['A', 'E'])

0    D
1    D
2    D
3    B
4    D
5    C
6    D
7    B
8    D
9    B
Name: category, dtype: category
Categories (5, object): ['B', 'C', 'D', 'A', 'E']

In [27]:
# Persist the extra categories 'A' and 'E' on the frame. The result is
# assigned back to the column rather than passing `inplace=True`: the inplace
# flag has been deprecated and removed from the pandas categorical API (and,
# as far as I know, from newer cuDF releases as well), and an explicit
# assignment makes the mutation of `df` obvious to the reader.
df['category'] = df['category'].cat.add_categories(['A', 'E'])
df['category']

0    D
1    D
2    D
3    B
4    D
5    C
6    D
7    B
8    D
9    B
Name: category, dtype: category
Categories (5, object): ['B', 'C', 'D', 'A', 'E']

#### cudf.core.column.categorical.CategoricalAccessor.categories

In [28]:
# `categories` is a property (no call parentheses): it returns the index of
# category labels, including the 'A' and 'E' labels added above.
df['category'].cat.categories

StringIndex(['B' 'C' 'D' 'A' 'E'], dtype='object')

#### cudf.core.column.categorical.CategoricalAccessor.remove_categories()

In [29]:
# Return a copy of the column with the categories 'A' and 'E' dropped from its
# category list; `df` itself is not modified by this call.
df['category'].cat.remove_categories(removals=['A', 'E'])

0    D
1    D
2    D
3    B
4    D
5    C
6    D
7    B
8    D
9    B
Name: category, dtype: category
Categories (3, object): ['B', 'C', 'D']

In [30]:
# Persist the removal of 'A' and 'E' on the frame. The result is assigned back
# instead of using `inplace=True`, which has been deprecated and removed from
# the pandas categorical API (and, as far as I know, from newer cuDF releases
# as well).
df['category'] = df['category'].cat.remove_categories(['A', 'E'])

## <span style="color:blue">Date and time functions</span>

#### cudf.core.series.DatetimeProperties.day

In [31]:
# Day-of-month component of each timestamp (a property, not a method call);
# the null timestamp yields a null.
df['datetime'].dt.day

0       8
1    <NA>
2      10
3      11
4      15
5       1
6      30
7      10
8       9
9       3
Name: datetime, dtype: int16

#### cudf.core.series.DatetimeProperties.dayofweek

In [32]:
# Day of the week as an integer with Monday = 0 (2020-10-08, a Thursday, gives 3).
df['datetime'].dt.dayofweek

0       3
1    <NA>
2       5
3       6
4       3
5       3
6       2
7       5
8       4
9       5
Name: datetime, dtype: int16

#### cudf.core.series.DatetimeProperties.year

In [33]:
# Year component of each timestamp; the null timestamp yields a null.
df['datetime'].dt.year

0    2020
1    <NA>
2    2020
3    2020
4    2020
5    2020
6    2020
7    2020
8    2020
9    2020
Name: datetime, dtype: int16

## <span style="color:blue">Mathematical and statistical functions</span>

#### cudf.core.dataframe.DataFrame.corr()

In [34]:
# Pairwise correlation matrix of the two numeric columns.
df[['num', 'float']].corr()

Unnamed: 0,num,float
num,1.0,-0.270802
float,-0.270802,1.0


#### cudf.core.series.Series.cumsum()

In [35]:
# Running total of the 'num' column (called on a Series here).
df['num'].cumsum()

0     39
1     50
2     81
3    121
4    154
5    196
6    232
7    270
8    287
9    297
Name: num, dtype: int64