In [1]:
import os
# Ask PySpark to pull in the Qubole Kinesis connector (the coordinates name a
# Scala 2.11 / Spark 2.4 build) and then start the regular pyspark shell.
# NOTE(review): this is assigned before findspark/pyspark are imported,
# presumably so the package is picked up when the Spark JVM is launched --
# keep this ordering; confirm the versions match the local Spark install.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages=com.qubole.spark/spark-sql-kinesis_2.11/1.1.3-spark_2.4 pyspark-shell'
import findspark
# Runs before the pyspark imports below.
# NOTE(review): presumably this locates the Spark install and makes pyspark
# importable -- confirm against the findspark docs.
findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructField, StructType, StringType, IntegerType
import boto3
import json

In [2]:
# Make sure your AWS credentials are configured before running this cell.

# Create the Kinesis stream.
# The region is pinned to eu-west-1 because the Spark reader in this notebook
# is hard-wired to the eu-west-1 Kinesis endpoint; relying on the ambient
# default region could create the stream somewhere Spark never looks.
client = boto3.client('kinesis', region_name='eu-west-1')
stream_name='pyspark-kinesis'

create_response = client.create_stream(
        StreamName=stream_name,
        ShardCount=1)

# create_stream returns while the stream is still being created. Block until
# it is usable so the put_record calls in the next cell do not fail when the
# notebook is executed with "Run All".
client.get_waiter('stream_exists').wait(StreamName=stream_name)

# Keep the API response as the cell's displayed output.
create_response

{'ResponseMetadata': {'RequestId': 'c2d85e3c-cc43-37de-95a9-de32bfbdb25f',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'c2d85e3c-cc43-37de-95a9-de32bfbdb25f',
   'x-amz-id-2': 'nQUjPs4QPhvKG2hBexaRZbPXyD47szAqq0WqtdGRYMHs5VVCqMzLL+O9EmYTbtbqkw17GFJdV7pNfFnKrq6W3rn07w4ME5awGhP/dI+9/P4=',
   'date': 'Sat, 27 Jun 2020 17:11:42 GMT',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0'},
  'RetryAttempts': 0}}

In [3]:
# Sample payloads to publish to Kinesis, written as (message_type, count)
# pairs and expanded into one dict per record.
messages = [
    {'message_type': kind, 'count': amount}
    for kind, amount in (
        ('message1', 2),
        ('message2', 1),
        ('message1', 2),
        ('message3', 3),
        ('message1', 5),
    )
]

In [4]:
# Publish the sample messages one record at a time; each is JSON-encoded and
# sent with the same partition key.
for message in messages:
    body = json.dumps(message)
    client.put_record(StreamName=stream_name, PartitionKey='part_key', Data=body)

In [5]:
# Obtain a Spark session (created if none exists yet) that runs in local mode.
spark = (
    SparkSession.builder
    .appName('PySparkKinesis')
    .master('local[*]')
    .getOrCreate()
)

In [6]:
# Streaming DataFrame backed by the Kinesis stream.
# The chain is wrapped in parentheses rather than joined with backslashes: the
# previous version ended with a dangling "\" after .load(), which glues the
# statement onto whatever line follows it and is a SyntaxError if the cell
# ends there.
# Credentials come from the KINESIS_ACCESS_KEY / KINESIS_SECRET_KEY environment
# variables; a missing variable raises KeyError here.
kinesis = (
    spark.readStream
    .format('kinesis')
    .option('streamName', stream_name)
    .option('endpointUrl', 'https://kinesis.eu-west-1.amazonaws.com')
    .option('region', 'eu-west-1')
    .option('awsAccessKeyId', os.environ['KINESIS_ACCESS_KEY'])
    .option('awsSecretKey', os.environ['KINESIS_SECRET_KEY'])
    # TRIM_HORIZON is the Kinesis name for "start at the oldest retained record".
    .option('startingposition', 'TRIM_HORIZON')
    .load()
)

In [10]:
# Layout of the JSON document carried by each record: a string field
# "message_type" and an integer field "count".
schema = (
    StructType()
    .add(StructField("message_type", StringType()))
    .add(StructField("count", IntegerType()))
)

In [15]:
# Decode the record payload to text, parse it as JSON using `schema`, flatten
# the parsed struct into top-level columns and write the rows to the console.
# The query is started with a run-once trigger and the cell blocks until the
# query terminates.
(
    kinesis
    .selectExpr('CAST(data AS STRING)')
    .select(from_json('data', schema).alias('data'))
    .select('data.*')
    .writeStream
    .outputMode('append')
    .format('console')
    .trigger(once=True)
    .start()
    .awaitTermination()
)

In [16]:
# Tear-down: delete the Kinesis stream named by `stream_name`.
client.delete_stream(
    StreamName=stream_name,
)

{'ResponseMetadata': {'RequestId': 'e6db1236-50ef-7819-b1aa-7c794fda075f',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'e6db1236-50ef-7819-b1aa-7c794fda075f',
   'x-amz-id-2': 'EIDwFsTGgXZ30hunYdYiCITKsWz+fnJiBSu3rEJ6KD9N5TUTh4L0V8splZoOkJJiKXcPXJscDp44bHl5ixlqYHwVci6THu1J',
   'date': 'Sat, 27 Jun 2020 15:55:59 GMT',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '0'},
  'RetryAttempts': 0}}