# cuDF Cheat Sheets sample code

(c) 2020 NVIDIA, Blazing SQL

Distributed under Apache License 2.0

### Imports

In [1]:
import cudf
import pandas as pd
import numpy as np

### Sample DataFrame

In [2]:
# pandas
# Ten sample rows, one tuple per row, laid out in the order given by ``columns``.
# Row 1 has no datetime/timedelta, row 6 has no word and row 9 has no string, so
# the frame carries missing values for the null-handling examples.
pandas_df = pd.DataFrame(
    data=[
        (
            39, 6.88, np.datetime64('2020-10-08T12:12:01'), np.timedelta64(14378, 's'), 'C', 'D', 'data',
            'RAPIDS.ai is a suite of open-source libraries that allow you to run your end to end data science and analytics pipelines on GPUs.',
        ),
        (
            11, 4.21, None, None, 'A', 'D', 'cuDF',
            'cuDF is a Python GPU DataFrame (built on the Apache Arrow columnar memory format)',
        ),
        (
            31, 4.71, np.datetime64('2020-10-10T09:26:43'), np.timedelta64(12909, 's'), 'U', 'D', 'memory',
            'cuDF allows for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame style API.',
        ),
        (
            40, 0.93, np.datetime64('2020-10-11T17:10:00'), np.timedelta64(10466, 's'), 'P', 'B', 'tabular',
            # Adjacent literals: the embedded newline and indentation are part of the value.
            'If your workflow is fast enough on a single GPU or your data comfortably fits in memory on \n'
            '                 a single GPU, you would want to use cuDF.',
        ),
        (
            33, 9.26, np.datetime64('2020-10-15T10:58:02'), np.timedelta64(35558, 's'), 'O', 'D', 'parallel',
            # Adjacent literals: the embedded newline and indentation are part of the value.
            'If you want to distribute your workflow across multiple GPUs or have more data than you can fit \n'
            '                 in memory on a single GPU you would want to use Dask-cuDF',
        ),
        (
            42, 4.21, np.datetime64('2020-10-01T10:02:23'), np.timedelta64(20480, 's'), 'U', 'C', 'GPUs',
            'BlazingSQL provides a high-performance distributed SQL engine in Python',
        ),
        (
            36, 3.01, np.datetime64('2020-09-30T14:36:26'), np.timedelta64(24409, 's'), 'T', 'D', None,
            'BlazingSQL is built on the RAPIDS GPU data science ecosystem',
        ),
        (
            38, 6.44, np.datetime64('2020-10-10T08:34:36'), np.timedelta64(90171, 's'), 'X', 'B', 'csv',
            'BlazingSQL lets you ETL raw data directly into GPU memory as a GPU DataFrame (GDF)',
        ),
        (
            17, 5.28, np.datetime64('2020-10-09T08:34:40'), np.timedelta64(30532, 's'), 'P', 'D', 'dataframes',
            'Dask is a flexible library for parallel computing in Python',
        ),
        (
            10, 8.28, np.datetime64('2020-10-03T03:31:21'), np.timedelta64(23552, 's'), 'W', 'B', 'python',
            None,
        ),
    ],
    columns=['num', 'float', 'datetime', 'timedelta', 'char', 'category', 'word', 'string'],
).astype({'category': 'category'})  # only ``category`` is converted; other columns keep their dtype

In [3]:
# cudf
# Same ten rows as the pandas sample frame, one tuple per row in ``columns`` order.
# Row 1 has no datetime/timedelta, row 6 has no word and row 9 has no string.
cudf_df = cudf.DataFrame(
    [
        (
            39, 6.88, np.datetime64('2020-10-08T12:12:01'), np.timedelta64(14378, 's'), 'C', 'D', 'data',
            'RAPIDS.ai is a suite of open-source libraries that allow you to run your end to end data science and analytics pipelines on GPUs.',
        ),
        (
            11, 4.21, None, None, 'A', 'D', 'cuDF',
            'cuDF is a Python GPU DataFrame (built on the Apache Arrow columnar memory format)',
        ),
        (
            31, 4.71, np.datetime64('2020-10-10T09:26:43'), np.timedelta64(12909, 's'), 'U', 'D', 'memory',
            'cuDF allows for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame style API.',
        ),
        (
            40, 0.93, np.datetime64('2020-10-11T17:10:00'), np.timedelta64(10466, 's'), 'P', 'B', 'tabular',
            # Adjacent literals: the embedded newline and indentation are part of the value.
            'If your workflow is fast enough on a single GPU or your data comfortably fits in memory on \n'
            '                 a single GPU, you would want to use cuDF.',
        ),
        (
            33, 9.26, np.datetime64('2020-10-15T10:58:02'), np.timedelta64(35558, 's'), 'O', 'D', 'parallel',
            # Adjacent literals: the embedded newline and indentation are part of the value.
            'If you want to distribute your workflow across multiple GPUs or have more data than you can fit \n'
            '                 in memory on a single GPU you would want to use Dask-cuDF',
        ),
        (
            42, 4.21, np.datetime64('2020-10-01T10:02:23'), np.timedelta64(20480, 's'), 'U', 'C', 'GPUs',
            'BlazingSQL provides a high-performance distributed SQL engine in Python',
        ),
        (
            36, 3.01, np.datetime64('2020-09-30T14:36:26'), np.timedelta64(24409, 's'), 'T', 'D', None,
            'BlazingSQL is built on the RAPIDS GPU data science ecosystem',
        ),
        (
            38, 6.44, np.datetime64('2020-10-10T08:34:36'), np.timedelta64(90171, 's'), 'X', 'B', 'csv',
            'BlazingSQL lets you ETL raw data directly into GPU memory as a GPU DataFrame (GDF)',
        ),
        (
            17, 5.28, np.datetime64('2020-10-09T08:34:40'), np.timedelta64(30532, 's'), 'P', 'D', 'dataframes',
            'Dask is a flexible library for parallel computing in Python',
        ),
        (
            10, 8.28, np.datetime64('2020-10-03T03:31:21'), np.timedelta64(23552, 's'), 'W', 'B', 'python',
            None,
        ),
    ],
    columns=['num', 'float', 'datetime', 'timedelta', 'char', 'category', 'word', 'string'],
)
# Re-type the ``category`` column as categorical, mirroring the pandas frame.
cudf_df['category'] = cudf_df['category'].astype('category')

---

# Transforming

---

## <span style="color:blue">DataFrame</span>

#### cudf.core.dataframe.DataFrame.apply_rows()

In [4]:
# pandas
def pandas_regression(a, b, A_coeff, B_coeff, constant):
    """Return the linear combination ``A_coeff * a + B_coeff * b + constant``."""
    weighted_sum = A_coeff * a + B_coeff * b
    return weighted_sum + constant


def _regress_row(row):
    """Apply ``pandas_regression`` to one row's ``num`` and ``float`` values."""
    return pandas_regression(
        row['num'],
        row['float'],
        A_coeff=0.21,
        B_coeff=-2.82,
        constant=3.43,
    )


# The result is stored as a new ``output`` column on ``pandas_df`` itself, so
# every later pandas cell shows that column.
# NOTE(review): the cuDF counterpart only displays its result and does not
# store it, so the two sample frames differ by this column from here on.
pandas_df['output'] = pandas_df.apply(_regress_row, axis=1)
pandas_df

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string,output
0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...,-7.7816
1,11,4.21,NaT,NaT,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...,-6.1322
2,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,D,memory,"cuDF allows for loading, joining, aggregating,...",-3.3422
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...,9.2074
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...,-15.7532
5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...,0.3778
6,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...,2.5018
7,38,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,B,csv,BlazingSQL lets you ETL raw data directly into...,-6.7508
8,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,D,dataframes,Dask is a flexible library for parallel comput...,-7.8896
9,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,B,python,,-17.8196


In [5]:
# cudf
def cudf_regression(a, b, output, A_coeff, B_coeff, constant):
    """Row-wise kernel for ``apply_rows``: fill ``output`` with ``A_coeff * a + B_coeff * b + constant``.

    ``a`` and ``b`` receive the columns mapped in ``incols``, ``output`` is the
    column declared in ``outcols`` and the remaining parameters arrive through
    ``kwargs``. Nothing is returned; each result is written into ``output`` at
    the position of the input pair it was computed from.
    """
    for i, (aa, bb) in enumerate(zip(a,b)):
        output[i] = A_coeff * aa + B_coeff * bb + constant
        
# ``incols`` maps frame columns to kernel argument names, ``outcols`` declares
# the new column and its dtype, ``kwargs`` supplies the scalar arguments.
# NOTE(review): the returned frame is only displayed, not assigned, so
# ``cudf_df`` has no ``output`` column in later cells (the pandas cell, by
# contrast, stores its result on ``pandas_df``).
cudf_df.apply_rows(
    cudf_regression
    , incols = {'num': 'a', 'float': 'b'}
    , outcols = {'output': np.float64}
    , kwargs = {'A_coeff': 0.21, 'B_coeff': -2.82, 'constant': 3.43}
)

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string,output
0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...,-7.7816
1,11,4.21,,,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...,-6.1322
2,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,D,memory,"cuDF allows for loading, joining, aggregating,...",-3.3422
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...,9.2074
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...,-15.7532
5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...,0.3778
6,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...,2.5018
7,38,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,B,csv,BlazingSQL lets you ETL raw data directly into...,-6.7508
8,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,D,dataframes,Dask is a flexible library for parallel comput...,-7.8896
9,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,B,python,,-17.8196


#### cudf.core.dataframe.DataFrame.drop()

In [6]:
# pandas
# Remove the row labelled 1; the result is a new frame and pandas_df itself keeps the row.
pandas_df.drop(1)

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string,output
0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...,-7.7816
2,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,D,memory,"cuDF allows for loading, joining, aggregating,...",-3.3422
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...,9.2074
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...,-15.7532
5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...,0.3778
6,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...,2.5018
7,38,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,B,csv,BlazingSQL lets you ETL raw data directly into...,-6.7508
8,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,D,dataframes,Dask is a flexible library for parallel comput...,-7.8896
9,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,B,python,,-17.8196


In [7]:
# cudf
# Remove the row labelled 1; the result is a new frame and cudf_df itself keeps the row.
cudf_df.drop(1)

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...
2,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,D,memory,"cuDF allows for loading, joining, aggregating,..."
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...
5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...
6,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...
7,38,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,B,csv,BlazingSQL lets you ETL raw data directly into...
8,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,D,dataframes,Dask is a flexible library for parallel comput...
9,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,B,python,


In [8]:
# pandas
# Remove several rows at once: range(1,7) covers the labels 1 through 6.
pandas_df.drop(range(1,7))

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string,output
0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...,-7.7816
7,38,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,B,csv,BlazingSQL lets you ETL raw data directly into...,-6.7508
8,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,D,dataframes,Dask is a flexible library for parallel comput...,-7.8896
9,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,B,python,,-17.8196


In [9]:
# cudf
# Remove several rows at once: range(1,7) covers the labels 1 through 6.
cudf_df.drop(range(1,7))

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...
7,38,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,B,csv,BlazingSQL lets you ETL raw data directly into...
8,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,D,dataframes,Dask is a flexible library for parallel comput...
9,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,B,python,


In [10]:
# pandas
# axis=1 makes the label refer to a column: remove the ``word`` column.
pandas_df.drop('word', axis=1)

Unnamed: 0,num,float,datetime,timedelta,char,category,string,output
0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,RAPIDS.ai is a suite of open-source libraries ...,-7.7816
1,11,4.21,NaT,NaT,A,D,cuDF is a Python GPU DataFrame (built on the A...,-6.1322
2,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,D,"cuDF allows for loading, joining, aggregating,...",-3.3422
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,If your workflow is fast enough on a single GP...,9.2074
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,If you want to distribute your workflow across...,-15.7532
5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,BlazingSQL provides a high-performance distrib...,0.3778
6,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,D,BlazingSQL is built on the RAPIDS GPU data sci...,2.5018
7,38,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,B,BlazingSQL lets you ETL raw data directly into...,-6.7508
8,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,D,Dask is a flexible library for parallel comput...,-7.8896
9,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,B,,-17.8196


In [11]:
# cudf
# axis=1 makes the label refer to a column: remove the ``word`` column.
cudf_df.drop('word', axis=1)

Unnamed: 0,num,float,datetime,timedelta,char,category,string
0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,RAPIDS.ai is a suite of open-source libraries ...
1,11,4.21,,,A,D,cuDF is a Python GPU DataFrame (built on the A...
2,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,D,"cuDF allows for loading, joining, aggregating,..."
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,If your workflow is fast enough on a single GP...
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,If you want to distribute your workflow across...
5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,BlazingSQL provides a high-performance distrib...
6,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,D,BlazingSQL is built on the RAPIDS GPU data sci...
7,38,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,B,BlazingSQL lets you ETL raw data directly into...
8,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,D,Dask is a flexible library for parallel comput...
9,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,B,


In [12]:
# pandas
# A list of labels with axis=1 removes several columns in one call.
pandas_df.drop(['word', 'category'], axis=1)

Unnamed: 0,num,float,datetime,timedelta,char,string,output
0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,RAPIDS.ai is a suite of open-source libraries ...,-7.7816
1,11,4.21,NaT,NaT,A,cuDF is a Python GPU DataFrame (built on the A...,-6.1322
2,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,"cuDF allows for loading, joining, aggregating,...",-3.3422
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,If your workflow is fast enough on a single GP...,9.2074
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,If you want to distribute your workflow across...,-15.7532
5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,BlazingSQL provides a high-performance distrib...,0.3778
6,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,BlazingSQL is built on the RAPIDS GPU data sci...,2.5018
7,38,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,BlazingSQL lets you ETL raw data directly into...,-6.7508
8,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,Dask is a flexible library for parallel comput...,-7.8896
9,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,,-17.8196


In [13]:
# cudf
# A list of labels with axis=1 removes several columns in one call.
cudf_df.drop(['word', 'category'], axis=1)

Unnamed: 0,num,float,datetime,timedelta,char,string
0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,RAPIDS.ai is a suite of open-source libraries ...
1,11,4.21,,,A,cuDF is a Python GPU DataFrame (built on the A...
2,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,"cuDF allows for loading, joining, aggregating,..."
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,If your workflow is fast enough on a single GP...
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,If you want to distribute your workflow across...
5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,BlazingSQL provides a high-performance distrib...
6,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,BlazingSQL is built on the RAPIDS GPU data sci...
7,38,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,BlazingSQL lets you ETL raw data directly into...
8,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,Dask is a flexible library for parallel comput...
9,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,


#### cudf.core.dataframe.DataFrame.dropna()

In [14]:
# pandas
# Keep only rows without any missing value; with the sample data rows 1, 6 and 9 are removed.
pandas_df.dropna()

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string,output
0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...,-7.7816
2,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,D,memory,"cuDF allows for loading, joining, aggregating,...",-3.3422
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...,9.2074
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...,-15.7532
5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...,0.3778
7,38,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,B,csv,BlazingSQL lets you ETL raw data directly into...,-6.7508
8,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,D,dataframes,Dask is a flexible library for parallel comput...,-7.8896


In [15]:
# cudf
# Keep only rows without any missing value; with the sample data rows 1, 6 and 9 are removed.
cudf_df.dropna()

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...
2,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,D,memory,"cuDF allows for loading, joining, aggregating,..."
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...
5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...
7,38,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,B,csv,BlazingSQL lets you ETL raw data directly into...
8,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,D,dataframes,Dask is a flexible library for parallel comput...


In [16]:
# pandas
# Only look at ``datetime`` when deciding which rows to drop; here that removes row 1 alone.
pandas_df.dropna(subset=['datetime'])

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string,output
0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...,-7.7816
2,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,D,memory,"cuDF allows for loading, joining, aggregating,...",-3.3422
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...,9.2074
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...,-15.7532
5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...,0.3778
6,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...,2.5018
7,38,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,B,csv,BlazingSQL lets you ETL raw data directly into...,-6.7508
8,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,D,dataframes,Dask is a flexible library for parallel comput...,-7.8896
9,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,B,python,,-17.8196


In [17]:
# cudf
# Only look at ``datetime`` when deciding which rows to drop; here that removes row 1 alone.
cudf_df.dropna(subset=['datetime'])

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...
2,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,D,memory,"cuDF allows for loading, joining, aggregating,..."
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...
5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...
6,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...
7,38,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,B,csv,BlazingSQL lets you ETL raw data directly into...
8,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,D,dataframes,Dask is a flexible library for parallel comput...
9,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,B,python,


In [18]:
# pandas
# how='all' drops a row only when every column is missing; no sample row qualifies,
# so all ten rows are kept.
pandas_df.dropna(how='all')

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string,output
0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...,-7.7816
1,11,4.21,NaT,NaT,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...,-6.1322
2,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,D,memory,"cuDF allows for loading, joining, aggregating,...",-3.3422
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...,9.2074
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...,-15.7532
5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...,0.3778
6,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...,2.5018
7,38,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,B,csv,BlazingSQL lets you ETL raw data directly into...,-6.7508
8,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,D,dataframes,Dask is a flexible library for parallel comput...,-7.8896
9,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,B,python,,-17.8196


In [19]:
# cudf
# how='all' drops a row only when every column is missing; no sample row qualifies,
# so all ten rows are kept.
cudf_df.dropna(how='all')

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...
1,11,4.21,,,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...
2,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,D,memory,"cuDF allows for loading, joining, aggregating,..."
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...
5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...
6,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...
7,38,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,B,csv,BlazingSQL lets you ETL raw data directly into...
8,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,D,dataframes,Dask is a flexible library for parallel comput...
9,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,B,python,


In [20]:
# pandas
# axis=1 drops columns instead of rows: ``datetime``, ``timedelta``, ``word`` and
# ``string`` each contain a missing value and are removed.
pandas_df.dropna(axis=1)

Unnamed: 0,num,float,char,category,output
0,39,6.88,C,D,-7.7816
1,11,4.21,A,D,-6.1322
2,31,4.71,U,D,-3.3422
3,40,0.93,P,B,9.2074
4,33,9.26,O,D,-15.7532
5,42,4.21,U,C,0.3778
6,36,3.01,T,D,2.5018
7,38,6.44,X,B,-6.7508
8,17,5.28,P,D,-7.8896
9,10,8.28,W,B,-17.8196


In [21]:
# cudf
# axis=1 drops columns instead of rows: ``datetime``, ``timedelta``, ``word`` and
# ``string`` each contain a missing value and are removed.
cudf_df.dropna(axis=1)

Unnamed: 0,num,float,char,category
0,39,6.88,C,D
1,11,4.21,A,D
2,31,4.71,U,D
3,40,0.93,P,B
4,33,9.26,O,D
5,42,4.21,U,C
6,36,3.01,T,D
7,38,6.44,X,B
8,17,5.28,P,D
9,10,8.28,W,B


#### cudf.core.dataframe.DataFrame.fillna()

In [22]:
# pandas
# The dict maps a column name to its fill value: missing ``num`` entries become 1.
# NOTE(review): ``num`` has no missing values in the sample data, so the output
# equals the input; a column that has gaps would show the effect.
pandas_df.fillna({'num': 1})

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string,output
0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...,-7.7816
1,11,4.21,NaT,NaT,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...,-6.1322
2,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,D,memory,"cuDF allows for loading, joining, aggregating,...",-3.3422
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...,9.2074
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...,-15.7532
5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...,0.3778
6,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...,2.5018
7,38,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,B,csv,BlazingSQL lets you ETL raw data directly into...,-6.7508
8,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,D,dataframes,Dask is a flexible library for parallel comput...,-7.8896
9,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,B,python,,-17.8196


In [23]:
# cudf
# The dict maps a column name to its fill value: missing ``num`` entries become 1.
# NOTE(review): ``num`` has no missing values in the sample data, so the output
# equals the input; a column that has gaps would show the effect.
cudf_df.fillna({'num': 1})

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...
1,11,4.21,,,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...
2,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,D,memory,"cuDF allows for loading, joining, aggregating,..."
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...
5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...
6,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...
7,38,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,B,csv,BlazingSQL lets you ETL raw data directly into...
8,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,D,dataframes,Dask is a flexible library for parallel comput...
9,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,B,python,


#### cudf.core.dataframe.DataFrame.join()

In [24]:
# pandas
# Three-row lookup frame, written column by column.
pandas_categories = pd.DataFrame({
    'cat': ['B', 'C', 'D'],
    'name': ['cuDF', 'BlazingSQL', 'Dask'],
})

# join() lines the frames up by index label, so only rows 0-2 of pandas_df
# receive ``cat``/``name`` values; the other rows get missing values.
pandas_df.join(pandas_categories)

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string,output,cat,name
0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...,-7.7816,B,cuDF
1,11,4.21,NaT,NaT,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...,-6.1322,C,BlazingSQL
2,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,D,memory,"cuDF allows for loading, joining, aggregating,...",-3.3422,D,Dask
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...,9.2074,,
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...,-15.7532,,
5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...,0.3778,,
6,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...,2.5018,,
7,38,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,B,csv,BlazingSQL lets you ETL raw data directly into...,-6.7508,,
8,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,D,dataframes,Dask is a flexible library for parallel comput...,-7.8896,,
9,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,B,python,,-17.8196,,


In [25]:
# cudf
# Three-row lookup frame, written column by column.
cudf_categories = cudf.DataFrame({
    'cat': ['B', 'C', 'D'],
    'name': ['cuDF', 'BlazingSQL', 'Dask'],
})

# join() lines the frames up by index label, so only rows 0-2 of cudf_df
# receive ``cat``/``name`` values; the other rows get missing values.
cudf_df.join(cudf_categories)

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string,cat,name
0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...,B,cuDF
1,11,4.21,,,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...,C,BlazingSQL
2,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,D,memory,"cuDF allows for loading, joining, aggregating,...",D,Dask
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...,,
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...,,
5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...,,
6,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...,,
7,38,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,B,csv,BlazingSQL lets you ETL raw data directly into...,,
8,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,D,dataframes,Dask is a flexible library for parallel comput...,,
9,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,B,python,,,


In [26]:
# pandas
# Left join on the index with suffixes for clashing column names.
# NOTE(review): the two frames share no column names, so neither suffix shows
# up in the result and the output matches the plain join() of these frames.
pandas_df.join(pandas_categories, lsuffix='_l', rsuffix='_r', how='left')

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string,output,cat,name
0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...,-7.7816,B,cuDF
1,11,4.21,NaT,NaT,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...,-6.1322,C,BlazingSQL
2,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,D,memory,"cuDF allows for loading, joining, aggregating,...",-3.3422,D,Dask
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...,9.2074,,
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...,-15.7532,,
5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...,0.3778,,
6,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...,2.5018,,
7,38,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,B,csv,BlazingSQL lets you ETL raw data directly into...,-6.7508,,
8,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,D,dataframes,Dask is a flexible library for parallel comput...,-7.8896,,
9,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,B,python,,-17.8196,,


In [27]:
# cudf
# Left join on the index with suffixes for clashing column names.
# NOTE(review): the two frames share no column names, so neither suffix shows
# up in the result and the output matches the plain join() of these frames.
cudf_df.join(cudf_categories, lsuffix='_l', rsuffix='_r', how='left')

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string,cat,name
0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...,B,cuDF
1,11,4.21,,,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...,C,BlazingSQL
2,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,D,memory,"cuDF allows for loading, joining, aggregating,...",D,Dask
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...,,
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...,,
5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...,,
6,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...,,
7,38,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,B,csv,BlazingSQL lets you ETL raw data directly into...,,
8,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,D,dataframes,Dask is a flexible library for parallel comput...,,
9,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,B,python,,,


#### cudf.core.dataframe.DataFrame.merge()

In [28]:
# pandas
# Lookup frame for the merge examples, written column by column. The key column
# is named ``category`` so it matches the column of the same name in pandas_df.
# This rebinds ``pandas_categories``, replacing the frame used by the join cells.
pandas_categories = pd.DataFrame({
    'category': ['B', 'C', 'D'],
    'name': ['cuDF', 'BlazingSQL', 'Dask'],
})

In [29]:
# cudf
# Lookup frame for the merge examples, written column by column. The key column
# is named ``category`` so it matches the column of the same name in cudf_df.
# This rebinds ``cudf_categories``, replacing the frame used by the join cells.
cudf_categories = cudf.DataFrame({
    'category': ['B', 'C', 'D'],
    'name': ['cuDF', 'BlazingSQL', 'Dask'],
})

In [30]:
# pandas
# Match rows on the shared ``category`` column. Every category in the sample data
# (B, C, D) has a match, so all ten rows appear; in the recorded output they are
# regrouped by key (D, B, C) and renumbered 0-9.
pandas_df.merge(pandas_categories, on='category')

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string,output,name
0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...,-7.7816,Dask
1,11,4.21,NaT,NaT,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...,-6.1322,Dask
2,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,D,memory,"cuDF allows for loading, joining, aggregating,...",-3.3422,Dask
3,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...,-15.7532,Dask
4,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...,2.5018,Dask
5,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,D,dataframes,Dask is a flexible library for parallel comput...,-7.8896,Dask
6,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...,9.2074,cuDF
7,38,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,B,csv,BlazingSQL lets you ETL raw data directly into...,-6.7508,cuDF
8,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,B,python,,-17.8196,cuDF
9,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...,0.3778,BlazingSQL


In [31]:
# cudf
# Match rows on the shared ``category`` column; all ten rows find a match.
# In the recorded output the rows keep their original order, unlike the
# pandas result for the same call.
cudf_df.merge(cudf_categories, on='category')

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string,name
0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...,Dask
1,11,4.21,,,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...,Dask
2,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,D,memory,"cuDF allows for loading, joining, aggregating,...",Dask
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...,cuDF
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...,Dask
5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...,BlazingSQL
6,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...,Dask
7,38,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,B,csv,BlazingSQL lets you ETL raw data directly into...,cuDF
8,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,D,dataframes,Dask is a flexible library for parallel comput...,Dask
9,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,B,python,,cuDF


In [32]:
# pandas
# Left join: how='left' keeps every row of pandas_df (the left frame) and
# attaches the matching 'name' from pandas_categories.
pandas_df.merge(pandas_categories, on='category', how='left')

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string,output,name
0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...,-7.7816,Dask
1,11,4.21,NaT,NaT,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...,-6.1322,Dask
2,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,D,memory,"cuDF allows for loading, joining, aggregating,...",-3.3422,Dask
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...,9.2074,cuDF
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...,-15.7532,Dask
5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...,0.3778,BlazingSQL
6,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...,2.5018,Dask
7,38,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,B,csv,BlazingSQL lets you ETL raw data directly into...,-6.7508,cuDF
8,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,D,dataframes,Dask is a flexible library for parallel comput...,-7.8896,Dask
9,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,B,python,,-17.8196,cuDF


In [33]:
# cudf
# Cast the join key to a string column before merging. This assignment
# overwrites cudf_df['category'] in place, so the cast persists for every
# later cell that uses cudf_df.
# NOTE(review): presumably required so the key dtype matches the 'category'
# column of cudf_categories for the left join -- confirm.
cudf_df['category'] = cudf_df['category'].astype('str')
# Left join against the category lookup table.
cudf_df.merge(cudf_categories, on='category', how='left')

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string,name
0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...,Dask
1,11,4.21,,,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...,Dask
2,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,D,memory,"cuDF allows for loading, joining, aggregating,...",Dask
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...,cuDF
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...,Dask
5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...,BlazingSQL
6,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...,Dask
7,38,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,B,csv,BlazingSQL lets you ETL raw data directly into...,cuDF
8,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,D,dataframes,Dask is a flexible library for parallel comput...,Dask
9,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,B,python,,cuDF


#### cudf.core.dataframe.DataFrame.rename()

In [34]:
# pandas
# With no axis given, the mapping is applied to index labels: row label 0
# becomes 100. The renamed frame is only displayed here, not assigned.
pandas_df.rename({0: 100})

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string,output
100,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...,-7.7816
1,11,4.21,NaT,NaT,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...,-6.1322
2,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,D,memory,"cuDF allows for loading, joining, aggregating,...",-3.3422
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...,9.2074
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...,-15.7532
5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...,0.3778
6,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...,2.5018
7,38,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,B,csv,BlazingSQL lets you ETL raw data directly into...,-6.7508
8,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,D,dataframes,Dask is a flexible library for parallel comput...,-7.8896
9,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,B,python,,-17.8196


In [35]:
# cudf
# Same call on the cuDF frame: the mapping relabels index entry 0 as 100.
# The renamed frame is only displayed here, not assigned.
cudf_df.rename({0: 100})

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
100,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...
1,11,4.21,,,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...
2,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,D,memory,"cuDF allows for loading, joining, aggregating,..."
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...
5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...
6,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...
7,38,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,B,csv,BlazingSQL lets you ETL raw data directly into...
8,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,D,dataframes,Dask is a flexible library for parallel comput...
9,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,B,python,


In [36]:
# pandas
# axis=1 applies the mapping to column labels: column 'num' is shown as
# 'numbers'. The renamed frame is only displayed here, not assigned.
pandas_df.rename({'num': 'numbers'}, axis=1)

Unnamed: 0,numbers,float,datetime,timedelta,char,category,word,string,output
0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...,-7.7816
1,11,4.21,NaT,NaT,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...,-6.1322
2,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,D,memory,"cuDF allows for loading, joining, aggregating,...",-3.3422
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...,9.2074
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...,-15.7532
5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...,0.3778
6,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...,2.5018
7,38,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,B,csv,BlazingSQL lets you ETL raw data directly into...,-6.7508
8,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,D,dataframes,Dask is a flexible library for parallel comput...,-7.8896
9,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,B,python,,-17.8196


In [37]:
# cudf
# axis=1 applies the mapping to column labels: column 'num' is shown as
# 'numbers'. The renamed frame is only displayed here, not assigned.
cudf_df.rename({'num': 'numbers'}, axis=1)

Unnamed: 0,numbers,float,datetime,timedelta,char,category,word,string
0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...
1,11,4.21,,,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...
2,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,D,memory,"cuDF allows for loading, joining, aggregating,..."
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...
5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...
6,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...
7,38,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,B,csv,BlazingSQL lets you ETL raw data directly into...
8,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,D,dataframes,Dask is a flexible library for parallel comput...
9,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,B,python,


#### cudf.core.dataframe.DataFrame.reset_index()

In [38]:
# pandas
# Replace the index with a fresh 0..n-1 range; the previous index values are
# kept as a new regular column named 'index'.
pandas_df.reset_index()

Unnamed: 0,index,num,float,datetime,timedelta,char,category,word,string,output
0,0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...,-7.7816
1,1,11,4.21,NaT,NaT,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...,-6.1322
2,2,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,D,memory,"cuDF allows for loading, joining, aggregating,...",-3.3422
3,3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...,9.2074
4,4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...,-15.7532
5,5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...,0.3778
6,6,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...,2.5018
7,7,38,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,B,csv,BlazingSQL lets you ETL raw data directly into...,-6.7508
8,8,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,D,dataframes,Dask is a flexible library for parallel comput...,-7.8896
9,9,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,B,python,,-17.8196


In [39]:
# cudf
# Replace the index with a fresh 0..n-1 range; the previous index values are
# kept as a new regular column named 'index'.
cudf_df.reset_index()

Unnamed: 0,index,num,float,datetime,timedelta,char,category,word,string
0,0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...
1,1,11,4.21,,,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...
2,2,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,D,memory,"cuDF allows for loading, joining, aggregating,..."
3,3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...
4,4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...
5,5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...
6,6,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...
7,7,38,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,B,csv,BlazingSQL lets you ETL raw data directly into...
8,8,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,D,dataframes,Dask is a flexible library for parallel comput...
9,9,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,B,python,


In [40]:
# pandas
# drop=True discards the previous index instead of inserting it as an
# 'index' column.
pandas_df.reset_index(drop=True)

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string,output
0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...,-7.7816
1,11,4.21,NaT,NaT,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...,-6.1322
2,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,D,memory,"cuDF allows for loading, joining, aggregating,...",-3.3422
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...,9.2074
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...,-15.7532
5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...,0.3778
6,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...,2.5018
7,38,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,B,csv,BlazingSQL lets you ETL raw data directly into...,-6.7508
8,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,D,dataframes,Dask is a flexible library for parallel comput...,-7.8896
9,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,B,python,,-17.8196


In [41]:
# cudf
# drop=True discards the previous index instead of inserting it as an
# 'index' column.
cudf_df.reset_index(drop=True)

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...
1,11,4.21,,,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...
2,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,D,memory,"cuDF allows for loading, joining, aggregating,..."
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...
5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...
6,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...
7,38,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,B,csv,BlazingSQL lets you ETL raw data directly into...
8,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,D,dataframes,Dask is a flexible library for parallel comput...
9,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,B,python,


#### cudf.core.dataframe.DataFrame.set_index()

In [42]:
# pandas
# Use the 'category' column as the row index; it is removed from the regular
# columns. Index labels are not unique here (several rows share 'D' and 'B').
pandas_df.set_index('category')

Unnamed: 0_level_0,num,float,datetime,timedelta,char,word,string,output
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
D,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,data,RAPIDS.ai is a suite of open-source libraries ...,-7.7816
D,11,4.21,NaT,NaT,A,cuDF,cuDF is a Python GPU DataFrame (built on the A...,-6.1322
D,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,memory,"cuDF allows for loading, joining, aggregating,...",-3.3422
B,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,tabular,If your workflow is fast enough on a single GP...,9.2074
D,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,parallel,If you want to distribute your workflow across...,-15.7532
C,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,GPUs,BlazingSQL provides a high-performance distrib...,0.3778
D,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,,BlazingSQL is built on the RAPIDS GPU data sci...,2.5018
B,38,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,csv,BlazingSQL lets you ETL raw data directly into...,-6.7508
D,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,dataframes,Dask is a flexible library for parallel comput...,-7.8896
B,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,python,,-17.8196


In [43]:
# cudf
# Use the 'category' column as the row index; it is removed from the regular
# columns. Index labels are not unique here (several rows share 'D' and 'B').
cudf_df.set_index('category')

Unnamed: 0_level_0,num,float,datetime,timedelta,char,word,string
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
D,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,data,RAPIDS.ai is a suite of open-source libraries ...
D,11,4.21,,,A,cuDF,cuDF is a Python GPU DataFrame (built on the A...
D,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,memory,"cuDF allows for loading, joining, aggregating,..."
B,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,tabular,If your workflow is fast enough on a single GP...
D,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,parallel,If you want to distribute your workflow across...
C,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,GPUs,BlazingSQL provides a high-performance distrib...
D,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,,BlazingSQL is built on the RAPIDS GPU data sci...
B,38,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,csv,BlazingSQL lets you ETL raw data directly into...
D,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,dataframes,Dask is a flexible library for parallel comput...
B,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,python,


In [44]:
# pandas
# drop=False makes 'category' the row index while also keeping it as a
# regular column.
pandas_df.set_index('category', drop=False)

Unnamed: 0_level_0,num,float,datetime,timedelta,char,category,word,string,output
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
D,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...,-7.7816
D,11,4.21,NaT,NaT,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...,-6.1322
D,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,D,memory,"cuDF allows for loading, joining, aggregating,...",-3.3422
B,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...,9.2074
D,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...,-15.7532
C,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...,0.3778
D,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...,2.5018
B,38,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,B,csv,BlazingSQL lets you ETL raw data directly into...,-6.7508
D,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,D,dataframes,Dask is a flexible library for parallel comput...,-7.8896
B,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,B,python,,-17.8196


In [45]:
# cudf
# drop=False makes 'category' the row index while also keeping it as a
# regular column.
cudf_df.set_index('category', drop=False)

Unnamed: 0_level_0,num,float,datetime,timedelta,char,category,word,string
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
D,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...
D,11,4.21,,,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...
D,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,D,memory,"cuDF allows for loading, joining, aggregating,..."
B,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...
D,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...
C,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...
D,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...
B,38,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,B,csv,BlazingSQL lets you ETL raw data directly into...
D,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,D,dataframes,Dask is a flexible library for parallel comput...
B,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,B,python,
