# cuDF Cheat Sheets sample code

(c) 2020 NVIDIA, Blazing SQL

Distributed under Apache License 2.0

# Imports

In [6]:
import streamz
import cudf
import json

from streamz.dataframe import DataFrame

# cuStreamz I/O

#### streamz.from_kafka()

In [2]:
# Kafka consumer settings: the brokers to contact and the consumer group id.
consumer_conf = {
    'bootstrap.servers': 'kafka0:19092,kafka1:19095',
    'group.id': 'custreamz',
}

# Source over the 'docs_log' topic. It is created with start=False and is
# started explicitly later through source.start().
source = streamz.Stream.from_kafka(
    ['docs_log'],
    consumer_conf,
    poll_interval=1,
    asynchronous=True,
    start=False,
)

In [3]:
def process_message(message):
    """Pre-process one JSON message into a per-word cudf DataFrame.

    The message is parsed with ``json.loads``; the first value of the
    resulting object is split on single spaces to obtain the words.
    As a side effect, the word count for this batch is printed.

    Parameters
    ----------
    message
        JSON-encoded object (anything ``json.loads`` accepts) whose first
        value is the text to split.

    Returns
    -------
    cudf.DataFrame
        One row per word, with columns 'word' and 'count' (always 1).
        The frame is NOT aggregated; aggregation happens downstream.
    """
    # Only the first value of the JSON object is used as the text
    words = list(json.loads(message).values())[0].split(' ')

    # Create a cudf dataframe
    batch_df = cudf.DataFrame({'word': words, 'count': [1] * len(words)})

    # [Optional] Show local (stateless) word count for this batch
    local_word_count = batch_df.groupby('word').sum()
    print(local_word_count)

    return batch_df

# Run every incoming message through the pre-processing helper
stream_df = source.map(process_message)

# Wrap the mapped stream in a streamz DataFrame to get a stateful word count;
# the empty cudf frame is passed as the `example` with the expected columns
sdf = DataFrame(
    stream_df,
    example=cudf.DataFrame({'word': [], 'count': []}),
)

# Formatting the print statements
def print_format(sdf):
    """Print a header for the cumulative word count and pass the data on.

    Parameters
    ----------
    sdf
        The value emitted by the upstream node; it is not modified.

    Returns
    -------
    The same object that was passed in, so it can flow further downstream.
    """
    # Same header as the other print_format helpers in this notebook
    print("\nGlobal Word Count:")
    return sdf

# After every batch, emit the word count accumulated since the start of the
# stream. The output could also be sunk to a list instead.
(
    sdf
    .groupby('word')
    .sum()
    .stream
    .gather()
    .map(print_format)
)

Output()

In [4]:
# Start the Kafka source explicitly (it was created with start=False)
source.start()

Unnamed: 0_level_0,count
word,Unnamed: 1_level_1
and,18
bar,28
mike,9
pipe,9
soap,28


#### streamz.from_kafka_batched()

In [None]:
# Kafka consumer configuration: brokers, consumer group and session timeout
consumer_conf = {
    'bootstrap.servers': 'kafka0:19092,kafka1:19095',
    'group.id': 'custreamz',
    'session.timeout.ms': '60000',
}

# Batched source over the 'docs_log' topic using the cudf engine. It is
# created with start=False and started explicitly later via source.start().
source = streamz.Stream.from_kafka_batched(
    'docs_log',
    consumer_conf,
    poll_interval='2s',
    asynchronous=True,
    dask=False,
    engine="cudf",
    start=False,
)

In [None]:
def process_batch(messages):
    """Compute and print per-word counts for one batch of messages.

    For every key obtained by iterating over `messages`, the corresponding
    string column is tokenized, the occurrences of each word are counted,
    the result is printed, and it is appended to the returned frame.

    Parameters
    ----------
    messages
        The batch emitted by the source. Assumed to be a cudf DataFrame of
        string columns, so that iteration yields column labels
        (TODO confirm against the Kafka payload).

    Returns
    -------
    cudf.DataFrame
        Columns 'word' and 'count'. Results of separate columns are
        concatenated, not merged, so a word may appear more than once.
        Empty when iterating `messages` yields nothing.
    """
    batch_df = cudf.DataFrame()

    for column in messages:
        # Split the text into one token per row
        tokens = messages[column].str.tokenize()

        # reset_index() exposes the row index as an 'index' column; counting
        # it per word gives the number of occurrences of that word
        df_split = (
            tokens
            .to_frame('word')
            .reset_index()
            .groupby(by='word')
            .agg({'index': 'count'})
            .rename(columns={'index': 'count'})
            .reset_index()
        )

        # Show the local (stateless) counts under their header; previously
        # only the header was printed
        print("\nWord Count for this batch:")
        print(df_split)

        batch_df = cudf.concat([batch_df, df_split])

    return batch_df

# Apply the batch pre-processing to every batch emitted by the source
stream_df = source.map(process_batch)

# Wrap the mapped stream in a streamz DataFrame to get a stateful word count;
# the empty cudf frame is passed as the `example` with the expected columns
sdf = DataFrame(
    stream_df,
    example=cudf.DataFrame({'word': [], 'count': []}),
)

def print_format(sdf):
    """Print the global word-count header, then hand `sdf` back unchanged."""
    header = "\nGlobal Word Count:"
    print(header)
    return sdf

# After every batch, emit the word count accumulated since the start of the
# stream. The output could also be sunk to a list instead.
(
    sdf
    .groupby('word')
    .sum()
    .stream
    .gather()
    .map(print_format)
)

In [None]:
# Start the batched Kafka source explicitly (it was created with start=False)
source.start()

#### streamz.from_textfile()

In [None]:
# Source over the lines of 'sample.json' ('\n' is the delimiter). It is
# created with start=False and started explicitly later via source.start().
source = streamz.Stream.from_textfile(
    'sample.json',
    poll_interval=2,
    delimiter='\n',
    start=False,
    from_end=False,
)

In [None]:
def process_message(message):
    """Build a one-row-per-word cudf DataFrame from a JSON message.

    Parses `message` with json.loads, takes the first value of the resulting
    object and splits it on single spaces. The word count for this batch is
    printed before the un-aggregated frame is returned.
    """
    payload = json.loads(message)
    first_value = list(payload.values())[0]
    tokens = first_value.split(' ')

    # One row per token, each carrying a count of 1
    frame = cudf.DataFrame({'word': tokens, 'count': [1] * len(tokens)})

    # [Optional] local (stateless) word count for this batch
    print(frame.groupby('word').sum())

    return frame
    
# Run every incoming line through the pre-processing helper
stream_df = source.map(process_message)

# Wrap the mapped stream in a streamz DataFrame to get a stateful word count;
# the empty cudf frame is passed as the `example` with the expected columns
sdf = DataFrame(
    stream_df,
    example=cudf.DataFrame({'word': [], 'count': []}),
)

def print_format(sdf):
    """Announce the cumulative word count and return `sdf` untouched."""
    banner = "\nGlobal Word Count:"
    print(banner)
    return sdf

# After every batch, emit the word count accumulated since the start of the
# stream. The output could also be sunk to a list instead.
(
    sdf
    .groupby('word')
    .sum()
    .stream
    .gather()
    .map(print_format)
)

In [None]:
# Start the text-file source explicitly (it was created with start=False)
source.start()

#### streamz.to_dataframe()

In [7]:
# Source over the lines of 'sample.json' ('\n' is the delimiter). It is
# created with start=False and started explicitly later via source.start().
source = streamz.Stream.from_textfile(
    'sample.json',
    poll_interval=2,
    delimiter='\n',
    start=False,
    from_end=False,
)

In [10]:
def process_message(message):
    """Convert one JSON message into a cudf DataFrame of its words.

    The first value of the decoded JSON object is split on single spaces;
    every resulting word becomes a row with a 'count' of 1. The word count
    for this batch is printed as a side effect.
    """
    text = list(json.loads(message).values())[0]
    word_list = text.split(' ')
    ones = [1] * len(word_list)

    # Un-aggregated frame: one row per word occurrence
    per_word = cudf.DataFrame({'word': word_list, 'count': ones})

    # [Optional] local (stateless) word count for this batch
    batch_counts = per_word.groupby('word').sum()
    print(batch_counts)

    return per_word
    
# Run every incoming line through the pre-processing helper
stream_df = source.map(process_message)

# Convert the mapped stream to a streamz DataFrame with to_dataframe() to get
# a stateful word count; the empty cudf frame is the `example` of the columns
sdf = stream_df.to_dataframe(
    example=cudf.DataFrame({'word': [], 'count': []}),
)

def print_format(sdf):
    """Emit the "Global Word Count:" header and return the input as-is."""
    title = "\nGlobal Word Count:"
    print(title)
    return sdf

# After every batch, emit the word count accumulated since the start of the
# stream. The output could also be sunk to a list instead.
(
    sdf
    .groupby('word')
    .sum()
    .stream
    .gather()
    .map(print_format)
)

Output()

In [11]:
# Start the text-file source explicitly (it was created with start=False)
source.start()

Unnamed: 0_level_0,count
word,Unnamed: 1_level_1
and,2
bar,2
mike,1
pipe,1
soap,2
