In [1]:
from pyspark.sql.streaming import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql import functions as f
from pyspark.sql.types import *
import time
import os

In [2]:
os.environ["PYSPARK_PYTHON"] = "python3"
os.environ["PYSPARK_DRIVER_PYTHON"] = "python3"
os.environ["SPARK_HOME"] = "/home/emre/spark-3.0.1-bin-hadoop2.7"
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1 pyspark-shell'

In [3]:
spark_sql_kafka = "/home/emre/spark-3.0.1-bin-hadoop2.7/jars/spark-sql-kafka-0-10_2.12-3.0.1.jar"
kafka_clients = "/home/emre/spark-3.0.1-bin-hadoop2.7/jars/kafka-clients-2.6.0.jar"

In [4]:
kafka_topic_name = "derbytopic"
kafka_bootstrap_servers = 'localhost:9092'

In [5]:
spark = SparkSession \
        .builder \
        .appName("spark-kafka2") \
        .master("local[*]") \
        .config("spark.jars", spark_sql_kafka) \
        .config("spark.jars", kafka_clients) \
        .config("spark.driver.extraClassPath","/home/emre/spark-3.0.1-bin-hadoop2.7/jars/*.jar") \
        .config("spark.executor.extraClassPath","/home/emre/spark-3.0.1-bin-hadoop2.7/jars/*.jar") \
        .getOrCreate()

In [6]:
 data_df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", kafka_bootstrap_servers) \
        .option("subscribe", kafka_topic_name) \
        .option("startingOffsets", "earliest") \
        .load()

In [7]:
twitter_df = data_df.selectExpr("CAST(value AS STRING)")

In [8]:
twitter_df.show()

+--------------------+
|               value|
+--------------------+
|{"text":"Abdulker...|
|{"text":"@smailSe...|
|{"text":"#Fenerba...|
|{"text":"Olsun ya...|
|{"text":"@alifuat...|
|{"text":"@AvMBoSS...|
|{"text":"RT @Just...|
|{"text":"Youtube ...|
|{"text":"RT @Cam0...|
|{"text":"RT @cEne...|
|{"text":"@yagosab...|
|{"text":"@lvntrbn...|
|{"text":"RT @Avru...|
|{"text":"Bakacak ...|
|{"text":"Ne olurs...|
|{"text":"RT @1982...|
|{"text":"RT @Tohu...|
+--------------------+



In [8]:
twitter_schema = StructType() \
        .add("text", StringType()) \
        .add("name", StringType()) \
        .add("location", StringType()) \
        .add("screen_name", StringType()) \
        .add("favourites_count", IntegerType()) \
        .add("verified", BooleanType()) \
        .add("followers_count",IntegerType()) \
        .add("friends_count",IntegerType()) \
        .add("retweet_count",IntegerType()) \
        .add("favourite_count",IntegerType())
        

In [9]:
twitter_df1 = twitter_df\
        .select(from_json(f.col("value"), twitter_schema)\
        .alias("wmt_twitter"))

In [10]:
twitter_df2 = twitter_df1.select("wmt_twitter.*")
twitter_df2.printSchema()

root
 |-- text: string (nullable = true)
 |-- name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- screen_name: string (nullable = true)
 |-- favourites_count: integer (nullable = true)
 |-- verified: boolean (nullable = true)
 |-- followers_count: integer (nullable = true)
 |-- friends_count: integer (nullable = true)
 |-- retweet_count: integer (nullable = true)
 |-- favourite_count: integer (nullable = true)



In [12]:
twitter_df2.show()

+--------------------+--------------------+------------------+---------------+----------------+--------+---------------+-------------+-------------+---------------+
|                text|                name|          location|    screen_name|favourites_count|verified|followers_count|friends_count|retweet_count|favourite_count|
+--------------------+--------------------+------------------+---------------+----------------+--------+---------------+-------------+-------------+---------------+
|Abdulkerim Durmaz...|        Fenerbahçe +|              null|        FB_arti|             480|   false|            462|          175|            0|              0|
|@smailSeyyar1 @al...| Mişli Gelecek Zaman|              null|       mustaphs|            1255|   false|            166|          260|            0|              0|
|#Fenerbahce diyec...|         Berke Akgül|Güngören, İstanbul|Nostaljik_Hayat|            1245|   false|            586|          160|            0|              0|
|        O

In [11]:
 query = twitter_df2 \
       .writeStream \
       .trigger(processingTime='5 seconds') \
       .outputMode("update") \
       .option("truncate", "false")\
       .format("console") \
       .start()

In [None]:
query.awaitTermination()

In [None]:
query.stop()