# cuDF Cheat Sheets sample code

(c) 2020 NVIDIA, Blazing SQL

Distributed under Apache License 2.0

### Imports

In [1]:
import cudf
import pandas as pd
import numpy as np

### Sample DataFrame

In [2]:
# pandas
pandas_df = cudf.DataFrame(
    [
          (39, 6.88, np.datetime64('2020-10-08T12:12:01'), np.timedelta64(14378,'s'), 'C', 'D', 'data'
            , 'RAPIDS.ai is a suite of open-source libraries that allow you to run your end to end data science and analytics pipelines on GPUs.')
        , (11, 4.21, None,                                 None                     , 'A', 'D', 'cuDF'
            , 'cuDF is a Python GPU DataFrame (built on the Apache Arrow columnar memory format)')
        , (31, 4.71, np.datetime64('2020-10-10T09:26:43'), np.timedelta64(12909,'s'), 'U', 'D', 'memory'
            , 'cuDF allows for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame style API.')
        , (40, 0.93, np.datetime64('2020-10-11T17:10:00'), np.timedelta64(10466,'s'), 'P', 'B', 'tabular'
            , '''If your workflow is fast enough on a single GPU or your data comfortably fits in memory on 
                 a single GPU, you would want to use cuDF.''')
        , (33, 9.26, np.datetime64('2020-10-15T10:58:02'), np.timedelta64(35558,'s'), 'O', 'D', 'parallel'
            , '''If you want to distribute your workflow across multiple GPUs or have more data than you can fit 
                 in memory on a single GPU you would want to use Dask-cuDF''')
        , (42, 4.21, np.datetime64('2020-10-01T10:02:23'), np.timedelta64(20480,'s'), 'U', 'C', 'GPUs'
            , 'BlazingSQL provides a high-performance distributed SQL engine in Python')
        , (36, 3.01, np.datetime64('2020-09-30T14:36:26'), np.timedelta64(24409,'s'), 'T', 'D', None
            , 'BlazingSQL is built on the RAPIDS GPU data science ecosystem')
        , (38, 6.44, np.datetime64('2020-10-10T08:34:36'), np.timedelta64(90171,'s'), 'X', 'B', 'csv'
            , 'BlazingSQL lets you ETL raw data directly into GPU memory as a GPU DataFrame (GDF)')
        , (17, 5.28, np.datetime64('2020-10-09T08:34:40'), np.timedelta64(30532,'s'), 'P', 'D', 'dataframes'
            , 'Dask is a flexible library for parallel computing in Python')
        , (10, 8.28, np.datetime64('2020-10-03T03:31:21'), np.timedelta64(23552,'s'), 'W', 'B', 'python'
            , None)
    ]
    , columns = ['num', 'float', 'datetime', 'timedelta', 'char', 'category', 'word', 'string']
)
pandas_df['category'] = pandas_df['category'].astype('category')

In [3]:
# cudf
cudf_df = cudf.DataFrame(
    [
          (39, 6.88, np.datetime64('2020-10-08T12:12:01'), np.timedelta64(14378,'s'), 'C', 'D', 'data'
            , 'RAPIDS.ai is a suite of open-source libraries that allow you to run your end to end data science and analytics pipelines on GPUs.')
        , (11, 4.21, None,                                 None                     , 'A', 'D', 'cuDF'
            , 'cuDF is a Python GPU DataFrame (built on the Apache Arrow columnar memory format)')
        , (31, 4.71, np.datetime64('2020-10-10T09:26:43'), np.timedelta64(12909,'s'), 'U', 'D', 'memory'
            , 'cuDF allows for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame style API.')
        , (40, 0.93, np.datetime64('2020-10-11T17:10:00'), np.timedelta64(10466,'s'), 'P', 'B', 'tabular'
            , '''If your workflow is fast enough on a single GPU or your data comfortably fits in memory on 
                 a single GPU, you would want to use cuDF.''')
        , (33, 9.26, np.datetime64('2020-10-15T10:58:02'), np.timedelta64(35558,'s'), 'O', 'D', 'parallel'
            , '''If you want to distribute your workflow across multiple GPUs or have more data than you can fit 
                 in memory on a single GPU you would want to use Dask-cuDF''')
        , (42, 4.21, np.datetime64('2020-10-01T10:02:23'), np.timedelta64(20480,'s'), 'U', 'C', 'GPUs'
            , 'BlazingSQL provides a high-performance distributed SQL engine in Python')
        , (36, 3.01, np.datetime64('2020-09-30T14:36:26'), np.timedelta64(24409,'s'), 'T', 'D', None
            , 'BlazingSQL is built on the RAPIDS GPU data science ecosystem')
        , (38, 6.44, np.datetime64('2020-10-10T08:34:36'), np.timedelta64(90171,'s'), 'X', 'B', 'csv'
            , 'BlazingSQL lets you ETL raw data directly into GPU memory as a GPU DataFrame (GDF)')
        , (17, 5.28, np.datetime64('2020-10-09T08:34:40'), np.timedelta64(30532,'s'), 'P', 'D', 'dataframes'
            , 'Dask is a flexible library for parallel computing in Python')
        , (10, 8.28, np.datetime64('2020-10-03T03:31:21'), np.timedelta64(23552,'s'), 'W', 'B', 'python'
            , None)
    ]
    , columns = ['num', 'float', 'datetime', 'timedelta', 'char', 'category', 'word', 'string']
)
cudf_df['category'] = cudf_df['category'].astype('category')

---

# Summarize

---

## <span style="color:blue">DataFrame</span>

#### cudf.core.dataframe.DataFrame.describe()

In [4]:
# pandas
pandas_df.describe()



Unnamed: 0,num,float,timedelta
count,10.0,10.0,9
mean,29.7,5.321,0 days 08:06:01.666666667
std,12.311241,2.481471,0 days 06:45:06.869167994
min,10.0,0.93,0 days 02:54:26
25%,20.5,4.21,0 days 03:59:38
50%,34.5,4.995,0 days 06:32:32
75%,38.75,6.77,0 days 08:28:52
max,42.0,9.26,1 days 01:02:51


In [5]:
# cudf
cudf_df.describe()



Unnamed: 0,num,float,timedelta
count,10.0,10.0,9
mean,29.7,5.321,0 days 08:06:01.666666667
std,12.311241,2.481471,0 days 06:45:06.869167994
min,10.0,0.93,0 days 02:54:26
25%,20.5,4.21,0 days 03:59:38
50%,34.5,4.995,0 days 06:32:32
75%,38.75,6.77,0 days 08:28:52
max,42.0,9.26,1 days 01:02:51


In [6]:
# pandas
pandas_df.describe(percentiles=[0.1, 0.25, 0.5, 0.75, 0.9])



Unnamed: 0,num,float,timedelta
count,10.0,10.0,9
mean,29.7,5.321,0 days 08:06:01.666666667
std,12.311241,2.481471,0 days 06:45:06.869167994
min,10.0,0.93,0 days 02:54:26
10%,10.9,2.802,0 days 03:27:00
25%,20.5,4.21,0 days 03:59:38
50%,34.5,4.995,0 days 06:32:32
75%,38.75,6.77,0 days 08:28:52
90%,40.2,8.378,0 days 12:54:40
max,42.0,9.26,1 days 01:02:51


In [7]:
# cudf
cudf_df.describe(percentiles=[0.1, 0.25, 0.5, 0.75, 0.9])



Unnamed: 0,num,float,timedelta
count,10.0,10.0,9
mean,29.7,5.321,0 days 08:06:01.666666667
std,12.311241,2.481471,0 days 06:45:06.869167994
min,10.0,0.93,0 days 02:54:26
10%,10.9,2.802,0 days 03:27:00
25%,20.5,4.21,0 days 03:59:38
50%,34.5,4.995,0 days 06:32:32
75%,38.75,6.77,0 days 08:28:52
90%,40.2,8.378,0 days 12:54:40
max,42.0,9.26,1 days 01:02:51


#### cudf.core.dataframe.DataFrame.groupby()

In [8]:
# pandas
pandas_df.groupby('category').count()

Unnamed: 0_level_0,num,float,datetime,timedelta,char,word,string
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
D,6,6,5,5,6,5,6
B,3,3,3,3,3,3,2
C,1,1,1,1,1,1,1


In [9]:
# cudf
cudf_df.groupby('category').count()

Unnamed: 0_level_0,num,float,datetime,timedelta,char,word,string
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
D,6,6,5,5,6,5,6
B,3,3,3,3,3,3,2
C,1,1,1,1,1,1,1


In [10]:
# pandas
pandas_df.groupby('category').agg({'num': 'sum', 'float': 'max', 'word': 'count'})

Unnamed: 0_level_0,num,float,word
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
D,167,9.26,5
B,88,8.28,3
C,42,4.21,1


In [11]:
# cudf
cudf_df.groupby('category').agg({'num': 'sum', 'float': 'max', 'word': 'count'})

Unnamed: 0_level_0,num,float,word
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
D,167,9.26,5
B,88,8.28,3
C,42,4.21,1


#### cudf.core.dataframe.DataFrame.max()

In [12]:
# pandas
pandas_df[['num', 'float']].max()

num      42.00
float     9.26
dtype: float64

In [13]:
# cudf
cudf_df[['num', 'float']].max()

num      42.00
float     9.26
dtype: float64

In [14]:
# pandas
pandas_df[['num', 'float']].max(axis=1)

0    39.0
1    11.0
2    31.0
3    40.0
4    33.0
5    42.0
6    36.0
7    38.0
8    17.0
9    10.0
dtype: float64

In [15]:
# cudf
cudf_df[['num', 'float']].max(axis=1)

0    39.0
1    11.0
2    31.0
3    40.0
4    33.0
5    42.0
6    36.0
7    38.0
8    17.0
9    10.0
dtype: float64

#### cudf.core.dataframe.DataFrame.mean()

In [16]:
# pandas
pandas_df[['num', 'float']].mean()

num      29.700
float     5.321
dtype: float64

In [17]:
# cudf
cudf_df[['num', 'float']].mean()

num      29.700
float     5.321
dtype: float64

In [18]:
# pandas
pandas_df[['num', 'float']].mean(axis=1)

0    22.940
1     7.605
2    17.855
3    20.465
4    21.130
5    23.105
6    19.505
7    22.220
8    11.140
9     9.140
dtype: float64

In [19]:
# cudf
cudf_df[['num', 'float']].mean(axis=1)

0    22.940
1     7.605
2    17.855
3    20.465
4    21.130
5    23.105
6    19.505
7    22.220
8    11.140
9     9.140
dtype: float64

#### cudf.core.dataframe.DataFrame.min()

In [20]:
# pandas
pandas_df[['num', 'float']].min()

num      10.00
float     0.93
dtype: float64

In [21]:
# cudf
cudf_df[['num', 'float']].min()

num      10.00
float     0.93
dtype: float64

In [22]:
# pandas
pandas_df[['num', 'float']].min(axis=1)

0    6.88
1    4.21
2    4.71
3    0.93
4    9.26
5    4.21
6    3.01
7    6.44
8    5.28
9    8.28
dtype: float64

In [23]:
# cudf
cudf_df[['num', 'float']].min(axis=1)

0    6.88
1    4.21
2    4.71
3    0.93
4    9.26
5    4.21
6    3.01
7    6.44
8    5.28
9    8.28
dtype: float64

#### cudf.core.dataframe.DataFrame.quantile()

In [24]:
# pandas
pandas_df[['num', 'float']].quantile()

num      34.500
float     4.995
Name: 0.5, dtype: float64

In [25]:
# cudf
cudf_df[['num', 'float']].quantile()

num      34.500
float     4.995
Name: 0.5, dtype: float64

In [26]:
# pandas
pandas_df[['num', 'float']].quantile(.25)

num      20.50
float     4.21
Name: 0.25, dtype: float64

In [27]:
# cudf
cudf_df[['num', 'float']].quantile(.25)

num      20.50
float     4.21
Name: 0.25, dtype: float64

#### cudf.core.dataframe.DataFrame.std()

In [28]:
# pandas
pandas_df[['num', 'float']].std()

num      12.311241
float     2.481471
dtype: float64

In [29]:
# cudf
cudf_df[['num', 'float']].std()

num      12.311241
float     2.481471
dtype: float64

In [30]:
# pandas
pandas_df[['num', 'float']].std(axis=1)

0    22.712270
1     4.801255
2    18.589837
3    27.626662
4    16.786715
5    26.721565
6    23.327453
7    22.316290
8     8.287291
9     1.216224
dtype: float64

In [31]:
# cudf
cudf_df[['num', 'float']].std(axis=1)

0    22.712270
1     4.801255
2    18.589837
3    27.626662
4    16.786715
5    26.721565
6    23.327453
7    22.316290
8     8.287291
9     1.216224
dtype: float64

#### cudf.core.dataframe.DataFrame.sum()

In [32]:
# pandas
pandas_df[['num', 'float']].sum()

num      297.00
float     53.21
dtype: float64

In [33]:
# cudf
cudf_df[['num', 'float']].sum()

num      297.00
float     53.21
dtype: float64

In [34]:
# pandas
pandas_df[['num', 'float']].sum(axis=1)

0    45.88
1    15.21
2    35.71
3    40.93
4    42.26
5    46.21
6    39.01
7    44.44
8    22.28
9    18.28
dtype: float64

In [35]:
# cudf
cudf_df[['num', 'float']].sum(axis=1)

0    45.88
1    15.21
2    35.71
3    40.93
4    42.26
5    46.21
6    39.01
7    44.44
8    22.28
9    18.28
dtype: float64