# cuDF Cheat Sheets sample code

(c) 2020 NVIDIA, Blazing SQL

Distributed under Apache License 2.0

### Imports

In [1]:
import cudf
import numpy as np
import pandas as pd

### Sample DataFrame

In [2]:
# pandas
pandas_df = pd.DataFrame(
    [
          (39, 6.88, np.datetime64('2020-10-08T12:12:01'), np.timedelta64(14378,'s'), 'C', 'D', 'data'
            , 'RAPIDS.ai is a suite of open-source libraries that allow you to run your end to end data science and analytics pipelines on GPUs.')
        , (11, 4.21, None,                                 None                     , 'A', 'D', 'cuDF'
            , 'cuDF is a Python GPU DataFrame (built on the Apache Arrow columnar memory format)')
        , (31, 4.71, np.datetime64('2020-10-10T09:26:43'), np.timedelta64(12909,'s'), 'U', 'D', 'memory'
            , 'cuDF allows for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame style API.')
        , (40, 0.93, np.datetime64('2020-10-11T17:10:00'), np.timedelta64(10466,'s'), 'P', 'B', 'tabular'
            , '''If your workflow is fast enough on a single GPU or your data comfortably fits in memory on 
                 a single GPU, you would want to use cuDF.''')
        , (33, 9.26, np.datetime64('2020-10-15T10:58:02'), np.timedelta64(35558,'s'), 'O', 'D', 'parallel'
            , '''If you want to distribute your workflow across multiple GPUs or have more data than you can fit 
                 in memory on a single GPU you would want to use Dask-cuDF''')
        , (42, 4.21, np.datetime64('2020-10-01T10:02:23'), np.timedelta64(20480,'s'), 'U', 'C', 'GPUs'
            , 'BlazingSQL provides a high-performance distributed SQL engine in Python')
        , (36, 3.01, np.datetime64('2020-09-30T14:36:26'), np.timedelta64(24409,'s'), 'T', 'D', None
            , 'BlazingSQL is built on the RAPIDS GPU data science ecosystem')
        , (38, 6.44, np.datetime64('2020-10-10T08:34:36'), np.timedelta64(90171,'s'), 'X', 'B', 'csv'
            , 'BlazingSQL lets you ETL raw data directly into GPU memory as a GPU DataFrame (GDF)')
        , (17, 5.28, np.datetime64('2020-10-09T08:34:40'), np.timedelta64(30532,'s'), 'P', 'D', 'dataframes'
            , 'Dask is a flexible library for parallel computing in Python')
        , (10, 8.28, np.datetime64('2020-10-03T03:31:21'), np.timedelta64(23552,'s'), 'W', 'B', 'python'
            , None)
    ]
    , columns = ['num', 'float', 'datetime', 'timedelta', 'char', 'category', 'word', 'string']
)
pandas_df['category'] = pandas_df['category'].astype('category')

In [3]:
# cudf
cudf_df = cudf.DataFrame(
    [
          (39, 6.88, np.datetime64('2020-10-08T12:12:01'), np.timedelta64(14378,'s'), 'C', 'D', 'data'
            , 'RAPIDS.ai is a suite of open-source libraries that allow you to run your end to end data science and analytics pipelines on GPUs.')
        , (11, 4.21, None,                                 None                     , 'A', 'D', 'cuDF'
            , 'cuDF is a Python GPU DataFrame (built on the Apache Arrow columnar memory format)')
        , (31, 4.71, np.datetime64('2020-10-10T09:26:43'), np.timedelta64(12909,'s'), 'U', 'D', 'memory'
            , 'cuDF allows for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame style API.')
        , (40, 0.93, np.datetime64('2020-10-11T17:10:00'), np.timedelta64(10466,'s'), 'P', 'B', 'tabular'
            , '''If your workflow is fast enough on a single GPU or your data comfortably fits in memory on 
                 a single GPU, you would want to use cuDF.''')
        , (33, 9.26, np.datetime64('2020-10-15T10:58:02'), np.timedelta64(35558,'s'), 'O', 'D', 'parallel'
            , '''If you want to distribute your workflow across multiple GPUs or have more data than you can fit 
                 in memory on a single GPU you would want to use Dask-cuDF''')
        , (42, 4.21, np.datetime64('2020-10-01T10:02:23'), np.timedelta64(20480,'s'), 'U', 'C', 'GPUs'
            , 'BlazingSQL provides a high-performance distributed SQL engine in Python')
        , (36, 3.01, np.datetime64('2020-09-30T14:36:26'), np.timedelta64(24409,'s'), 'T', 'D', None
            , 'BlazingSQL is built on the RAPIDS GPU data science ecosystem')
        , (38, 6.44, np.datetime64('2020-10-10T08:34:36'), np.timedelta64(90171,'s'), 'X', 'B', 'csv'
            , 'BlazingSQL lets you ETL raw data directly into GPU memory as a GPU DataFrame (GDF)')
        , (17, 5.28, np.datetime64('2020-10-09T08:34:40'), np.timedelta64(30532,'s'), 'P', 'D', 'dataframes'
            , 'Dask is a flexible library for parallel computing in Python')
        , (10, 8.28, np.datetime64('2020-10-03T03:31:21'), np.timedelta64(23552,'s'), 'W', 'B', 'python'
            , None)
    ]
    , columns = ['num', 'float', 'datetime', 'timedelta', 'char', 'category', 'word', 'string']
)
cudf_df['category'] = cudf_df['category'].astype('category')

---

# Object creation

---

In [4]:
# pandas
pd.DataFrame([1,2,3,4], columns=['ints'])

Unnamed: 0,ints
0,1
1,2
2,3
3,4


In [5]:
# cudf
cudf.DataFrame([1,2,3,4], columns=['ints'])

Unnamed: 0,ints
0,1
1,2
2,3
3,4


In [6]:
# pandas
pd.DataFrame({'ints': [1,2,3,4], 'strings': ['a','b','c',None]})

Unnamed: 0,ints,strings
0,1,a
1,2,b
2,3,c
3,4,


In [7]:
# cudf
cudf.DataFrame({'ints': [1,2,3,4], 'strings': ['a','b','c',None]})

Unnamed: 0,ints,strings
0,1,a
1,2,b
2,3,c
3,4,


In [8]:
# pandas
pandas_df_sample = pd.DataFrame()
pandas_df_sample['ints'] = [1,2,3,4]
pandas_df_sample['strings'] = ['a','b','c',None]
pandas_df_sample

Unnamed: 0,ints,strings
0,1,a
1,2,b
2,3,c
3,4,


In [9]:
# cudf
cudf_df_sample = cudf.DataFrame()
cudf_df_sample['ints'] = [1,2,3,4]
cudf_df_sample['strings'] = ['a','b','c',None]
cudf_df_sample

Unnamed: 0,ints,strings
0,1,a
1,2,b
2,3,c
3,4,


In [10]:
# pandas
pd.DataFrame([
    (1, 'a')
    , (2, 'b')
    , (3, 'c')
    , (4, None)
], columns=['ints', 'strings'])

Unnamed: 0,ints,strings
0,1,a
1,2,b
2,3,c
3,4,


In [11]:
# cudf
cudf.DataFrame([
    (1, 'a')
    , (2, 'b')
    , (3, 'c')
    , (4, None)
], columns=['ints', 'strings'])

Unnamed: 0,ints,strings
0,1,a
1,2,b
2,3,c
3,4,


## <span style="color:blue">DataFrame</span>

#### cudf.core.dataframe.DataFrame.from_pandas()

In [12]:
cudf.DataFrame.from_pandas(pandas_df)

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
0,39,6.88,2020-10-08 12:12:01.000000000,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...
1,11,4.21,,,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...
2,31,4.71,2020-10-10 09:26:43.000000000,0 days 03:35:09,U,D,memory,"cuDF allows for loading, joining, aggregating,..."
3,40,0.93,2020-10-11 17:10:00.000000000,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...
4,33,9.26,2020-10-15 10:58:02.000000000,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...
5,42,4.21,2020-10-01 10:02:23.000000000,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...
6,36,3.01,2020-09-30 14:36:26.000000000,0 days 06:46:49,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...
7,38,6.44,2020-10-10 08:34:36.000000000,1 days 01:02:51,X,B,csv,BlazingSQL lets you ETL raw data directly into...
8,17,5.28,2020-10-09 08:34:40.000000000,0 days 08:28:52,P,D,dataframes,Dask is a flexible library for parallel comput...
9,10,8.28,2020-10-03 03:31:21.000000000,0 days 06:32:32,W,B,python,


#### cudf.core.dataframe.DataFrame.from_records()

In [13]:
# pandas
pd.DataFrame.from_records(pandas_df[['num', 'float']].to_records())

Unnamed: 0,index,num,float
0,0,39,6.88
1,1,11,4.21
2,2,31,4.71
3,3,40,0.93
4,4,33,9.26
5,5,42,4.21
6,6,36,3.01
7,7,38,6.44
8,8,17,5.28
9,9,10,8.28


In [14]:
# cudf
cudf.DataFrame.from_records(cudf_df[['num', 'float']].to_records())

Unnamed: 0,index,num,float
0,0,39,6.88
1,1,11,4.21
2,2,31,4.71
3,3,40,0.93
4,4,33,9.26
5,5,42,4.21
6,6,36,3.01
7,7,38,6.44
8,8,17,5.28
9,9,10,8.28


#### cudf.core.dataframe.DataFrame.to_csv()

In [15]:
# pandas
pandas_df.to_csv('../results/pandas_df_with_index.csv')

In [16]:
# cudf
cudf_df.to_csv('../results/cudf_df_with_index.csv')

In [17]:
# pandas
pandas_df.to_csv('../results/pandas_df_no_index_no_header.csv', index=False, header=False)

In [18]:
# cudf
cudf_df.to_csv('../results/cudf_df_no_index_no_header.csv', index=False, header=False)

In [19]:
# pandas
pandas_df.to_csv('../results/pandas_df_tab_sep.tsv', sep='\t')

In [20]:
# cudf
cudf_df.to_csv('../results/cudf_df_tab_sep.tsv', sep='\t')

In [21]:
# pandas
with open('../results/pandas_df_buffer.csv', 'w') as f:
    pandas_df.to_csv(f)

In [22]:
# cudf
with open('../results/cudf_df_buffer.csv', 'w') as f:
    cudf_df.to_csv(f)

#### cudf.core.dataframe.DataFrame.to_json()

In [23]:
# pandas
pandas_df.to_json('../results/pandas_df_default.json')

In [24]:
# cudf
cudf_df.to_json('../results/cudf_df_default.json')



In [25]:
# pandas
pandas_df.to_json('../results/pandas_df_records.json', orient='records', lines=True)

In [26]:
# cudf
cudf_df.to_json('../results/cudf_df_records.json', orient='records', lines=True)

In [27]:
# pandas
pandas_df.to_json('../results/pandas_df_iso_dttm.json', date_format='iso')

In [28]:
# cudf
cudf_df.to_json('../results/cudf_df_iso_dttm.json', date_format='iso')

#### cudf.core.dataframe.DataFrame.to_pandas()

In [29]:
cudf_df.to_pandas()

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...
1,11,4.21,NaT,NaT,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...
2,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,D,memory,"cuDF allows for loading, joining, aggregating,..."
3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...
4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...
5,42,4.21,2020-10-01 10:02:23,0 days 05:41:20,U,C,GPUs,BlazingSQL provides a high-performance distrib...
6,36,3.01,2020-09-30 14:36:26,0 days 06:46:49,T,D,,BlazingSQL is built on the RAPIDS GPU data sci...
7,38,6.44,2020-10-10 08:34:36,1 days 01:02:51,X,B,csv,BlazingSQL lets you ETL raw data directly into...
8,17,5.28,2020-10-09 08:34:40,0 days 08:28:52,P,D,dataframes,Dask is a flexible library for parallel comput...
9,10,8.28,2020-10-03 03:31:21,0 days 06:32:32,W,B,python,


#### cudf.io.csv.read_csv()

In [30]:
# pandas
pandas_df_csv_read = pd.read_csv('../results/pandas_df_with_index.csv')
pandas_df_csv_read.head()

Unnamed: 0.1,Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
0,0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...
1,1,11,4.21,,,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...
2,2,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,D,memory,"cuDF allows for loading, joining, aggregating,..."
3,3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...
4,4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...


In [31]:
# cudf
cudf_df_csv_read = cudf.read_csv('../results/cudf_df_with_index.csv')
cudf_df_csv_read.head()

Unnamed: 0.1,Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
0,0,39,6.88,2020-10-08T12:12:01Z,0 days 03:59:38.000000000,C,D,data,RAPIDS.ai is a suite of open-source libraries ...
1,1,11,4.21,,,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...
2,2,31,4.71,2020-10-10T09:26:43Z,0 days 03:35:09.000000000,U,D,memory,"cuDF allows for loading, joining, aggregating,..."
3,3,40,0.93,2020-10-11T17:10:00Z,0 days 02:54:26.000000000,P,B,tabular,If your workflow is fast enough on a single GP...
4,4,33,9.26,2020-10-15T10:58:02Z,0 days 09:52:38.000000000,O,D,parallel,If you want to distribute your workflow across...


In [32]:
# pandas
pandas_df_csv_read = pd.read_csv('../results/pandas_df_with_index.csv', nrows=2)
pandas_df_csv_read.head()

Unnamed: 0.1,Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
0,0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...
1,1,11,4.21,,,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...


In [33]:
# cudf
cudf_df_csv_read = cudf.read_csv('../results/cudf_df_with_index.csv', nrows=2)
cudf_df_csv_read.head()

Unnamed: 0.1,Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
0,0,39,6.88,2020-10-08T12:12:01Z,0 days 03:59:38.000000000,C,D,data,RAPIDS.ai is a suite of open-source libraries ...
1,1,11,4.21,,,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...


In [34]:
# pandas
pandas_df_csv_read = pd.read_csv(
    '../results/pandas_df_with_index.csv'
    , skiprows=1
    , names=['Index', 'num', 'float', 'datetime', 'timedelta', 'char',
       'category', 'word', 'string'])
pandas_df_csv_read.head()

Unnamed: 0,Index,num,float,datetime,timedelta,char,category,word,string
0,0,39,6.88,2020-10-08 12:12:01,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...
1,1,11,4.21,,,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...
2,2,31,4.71,2020-10-10 09:26:43,0 days 03:35:09,U,D,memory,"cuDF allows for loading, joining, aggregating,..."
3,3,40,0.93,2020-10-11 17:10:00,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...
4,4,33,9.26,2020-10-15 10:58:02,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...


In [35]:
# cudf 
cudf_df_csv_read = cudf.read_csv(
    '../results/cudf_df_with_index.csv'
    , skiprows=1
    , names=['Index', 'num', 'float', 'datetime', 'timedelta', 'char',
       'category', 'word', 'string'])
cudf_df_csv_read.head()

Unnamed: 0,Index,num,float,datetime,timedelta,char,category,word,string
0,0,39,6.88,2020-10-08T12:12:01Z,0 days 03:59:38.000000000,C,D,data,RAPIDS.ai is a suite of open-source libraries ...
1,1,11,4.21,,,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...
2,2,31,4.71,2020-10-10T09:26:43Z,0 days 03:35:09.000000000,U,D,memory,"cuDF allows for loading, joining, aggregating,..."
3,3,40,0.93,2020-10-11T17:10:00Z,0 days 02:54:26.000000000,P,B,tabular,If your workflow is fast enough on a single GP...
4,4,33,9.26,2020-10-15T10:58:02Z,0 days 09:52:38.000000000,O,D,parallel,If you want to distribute your workflow across...


In [36]:
# pandas
pandas_df_csv_read = pd.read_csv('../results/pandas_df_tab_sep.tsv', delimiter='\t', usecols=['num', 'float'])
pandas_df_csv_read.head()

Unnamed: 0,num,float
0,39,6.88
1,11,4.21
2,31,4.71
3,40,0.93
4,33,9.26


In [37]:
# cudf
cudf_df_csv_read = cudf.read_csv('../results/cudf_df_tab_sep.tsv', delimiter='\t', usecols=['num', 'float'])
cudf_df_csv_read.head()

Unnamed: 0,num,float
0,39,6.88
1,11,4.21
2,31,4.71
3,40,0.93
4,33,9.26


#### cudf.io.json.read_json()

In [38]:
# pandas
pandas_df_json_read = pd.read_json('../results/pandas_df_default.json')
pandas_df_json_read['timedelta'] = pd.to_timedelta(pandas_df_json_read['timedelta'])
pandas_df_json_read.head()

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
0,39,6.88,2020-10-08 12:12:01,0 days 00:00:00.014378,C,D,data,RAPIDS.ai is a suite of open-source libraries ...
1,11,4.21,NaT,NaT,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...
2,31,4.71,2020-10-10 09:26:43,0 days 00:00:00.012909,U,D,memory,"cuDF allows for loading, joining, aggregating,..."
3,40,0.93,2020-10-11 17:10:00,0 days 00:00:00.010466,P,B,tabular,If your workflow is fast enough on a single GP...
4,33,9.26,2020-10-15 10:58:02,0 days 00:00:00.035558,O,D,parallel,If you want to distribute your workflow across...


In [39]:
# cudf
cudf_df_json_read = cudf.read_json('../results/cudf_df_default.json')
cudf_df_json_read['timedelta'] = cudf_df_json_read['timedelta'].astype('timedelta64[ms]')
cudf_df_json_read.head()



Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
0,39,6.88,2020-10-08 12:12:01.000000000,0 days 03:59:38,C,D,data,RAPIDS.ai is a suite of open-source libraries ...
1,11,4.21,,,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...
2,31,4.71,2020-10-10 09:26:43.000000000,0 days 03:35:09,U,D,memory,"cuDF allows for loading, joining, aggregating,..."
3,40,0.93,2020-10-11 17:10:00.000000000,0 days 02:54:26,P,B,tabular,If your workflow is fast enough on a single GP...
4,33,9.26,2020-10-15 10:58:02.000000000,0 days 09:52:38,O,D,parallel,If you want to distribute your workflow across...


In [40]:
# pandas
pandas_df_json_read = pd.read_json('../results/pandas_df_records.json', lines=True)
pandas_df_json_read.head()

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
0,39,6.88,2020-10-08 12:12:01,14378000.0,C,D,data,RAPIDS.ai is a suite of open-source libraries ...
1,11,4.21,NaT,,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...
2,31,4.71,2020-10-10 09:26:43,12909000.0,U,D,memory,"cuDF allows for loading, joining, aggregating,..."
3,40,0.93,2020-10-11 17:10:00,10466000.0,P,B,tabular,If your workflow is fast enough on a single GP...
4,33,9.26,2020-10-15 10:58:02,35558000.0,O,D,parallel,If you want to distribute your workflow across...


In [41]:
# cudf
cudf_df_json_read = cudf.read_json('../results/cudf_df_records.json', lines=True, engine='cudf')
cudf_df_json_read.head()

Unnamed: 0,num,float,datetime,timedelta,char,category,word,string
0,39,6.88,1602159121000.0,14378000.0,C,D,data,RAPIDS.ai is a suite of open-source libraries ...
1,11,4.21,,,A,D,cuDF,cuDF is a Python GPU DataFrame (built on the A...
2,31,4.71,1602322003000.0,12909000.0,U,D,memory,"cuDF allows for loading, joining, aggregating,..."
3,40,0.93,1602436200000.0,10466000.0,P,B,tabular,If your workflow is fast enough on a single GP...
4,33,9.26,1602759482000.0,35558000.0,O,D,parallel,If you want to distribute your workflow across...


In [42]:
# pandas
import pandas as fmwrk
df = fmwrk.read_csv('../results/pandas_df_with_index.csv')
desc_stats = df.describe()

(
    df
    .groupby(by='category')
    .agg({'num': 'sum', 'char': 'count'})
    .reset_index()
)

Unnamed: 0,category,num,char
0,B,88,3
1,C,42,1
2,D,167,6


In [43]:
df[['category', 'num', 'char']]

Unnamed: 0,category,num,char
0,D,39,C
1,D,11,A
2,D,31,U
3,B,40,P
4,D,33,O
5,C,42,U
6,D,36,T
7,B,38,X
8,D,17,P
9,B,10,W


In [44]:
# cudf
import cudf as fmwrk
df = fmwrk.read_csv('../results/pandas_df_with_index.csv')
desc_stats = df.describe()

(
    df
    .groupby(by='category')
    .agg({'num': 'sum', 'char': 'count'})
    .reset_index()
)

Unnamed: 0,category,num,char
0,B,88,3
1,C,42,1
2,D,167,6
