In [None]:
import sys
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.streaming import StreamingContext
from pyspark.streaming.kinesis import KinesisUtils, InitialPositionInStream
from pyspark.sql.types import *

# Create (or reuse) the Spark session for this notebook, using the "local" master.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Structured Streaming - Twitter Sentiment")
    .getOrCreate()
)

In [None]:

import os

# Name and AWS region of the Kinesis stream the tweets are read from.
kinesisStreamName = "twitter-data-kinesis"
kinesisRegion = "us-east-1"

# Credentials come from the standard AWS environment variables so that secrets
# never have to be pasted into (and saved with) the notebook. When a variable
# is not set the value falls back to the empty string.
awsAccessKeyId = os.environ.get("AWS_ACCESS_KEY_ID", "")
awsSecretKey = os.environ.get("AWS_SECRET_ACCESS_KEY", "")

In [None]:

# Streaming source: read the configured Kinesis stream, starting from the
# latest position, using the credentials defined in the configuration cell.
kinesisDF = (
    spark.readStream
    .format("kinesis")
    .option("streamName", kinesisStreamName)
    .option("region", kinesisRegion)
    .option("initialPosition", "latest")
    .option("format", "json")
    .option("awsAccessKey", awsAccessKeyId)
    .option("awsSecretKey", awsSecretKey)
    .option("inferSchema", "true")
    .load()
)

# Streaming sink: append incoming records to an in-memory table that can be
# queried under the name "tweets". `df` is the handle of the running query.
df = (
    kinesisDF.writeStream
    .format("memory")
    .outputMode("append")
    .queryName("tweets")
    .start()
)

In [None]:
# Stop the streaming query that was started by `.start()` and bound to `df`.
# NOTE(review): when the cells are run top to bottom this stops the query right
# after it starts — presumably meant to be run manually once enough tweets
# have been collected; confirm.
df.stop()

In [None]:
%sql

select partitionKey, cast(data as string) from tweets;

In [None]:
# Select the `data` field of the in-memory "tweets" table, cast to a string.
tweets = spark.sql("select cast(data as string) from tweets")

In [None]:
# Print the schema of the `tweets` DataFrame.
tweets.printSchema()

In [None]:
# Display the first 5 rows without truncating long values.
tweets.show(5, truncate=False)

In [None]:
# Number of rows currently returned by the `tweets` query.
tweets.count()

In [None]:
# Work on a small cached sample (at most 50 rows) while developing the parser,
# and preview its first 5 rows.
tweets_sample = tweets.limit(50).cache()
tweets_sample.show(5)

In [None]:
# Same 5-row preview of the sample, this time without truncating long values.
tweets_sample.show(5, truncate=False)

In [None]:
%fs ls /mnt/

In [None]:
%fs ls /mnt/my_twitter_data_project/

In [None]:
# Register the zipped sparkLDA package with the Spark context so that its
# modules can be imported in the following cell.
sc = spark.sparkContext
sc.addPyFile("dbfs:/mnt/my_twitter_data_project/sparkLDA.zip")

In [None]:
from sparkLDA.config import n_topics, extra_for_stemmed, seedNum, file_schema
from sparkLDA.utils import show_topics, evaluate
from sparkLDA.processing import preprocess_text

In [None]:
# Display the schema object of the sample DataFrame.
tweets_sample.schema

In [None]:
# Print the same schema in tree form.
tweets_sample.printSchema()

In [None]:
# Names of the fields of a parsed tweet record, in schema order.
# Note: hash_tag is declared as a plain string (an array-of-strings variant
# was considered previously but is not used).
_tweet_field_names = [
    "tweet_text",
    "hash_tag",
    "created_at",
    "retweet_count",
    "favorite_count",
    "retweeted",
    "truncated",
    "id",
    "user_name",
    "screen_name",
    "followers_count",
    "location",
    "geo",
    "invalid",
]

# Schema in which every field is a nullable string.
file_schema_str = StructType(
    [StructField(field_name, StringType(), True) for field_name in _tweet_field_names]
)

In [None]:
def parse_tweets(raw_records, n_fields=14):
  """Split tab-separated tweet records into a fixed number of columns.

  Intended to be passed to ``DataFrame.mapInPandas``.

  Args:
    raw_records: Iterator of pandas DataFrames, each having a ``data`` column
      with one tab-separated record per row.
    n_fields: Number of columns every yielded frame must have. Defaults to 14,
      the number of fields declared in ``file_schema_str``.

  Yields:
    One pandas DataFrame per input batch with exactly ``n_fields`` columns
    labelled ``0 .. n_fields - 1``. Records with fewer fields are padded with
    missing values; fields beyond ``n_fields`` are dropped.
  """
  expected_columns = range(n_fields)
  for pdf in raw_records:
    split_fields = pdf.data.str.split(pat="\t", expand=True)
    # With expand=True the number of columns equals the longest record in the
    # batch, so it varies from batch to batch. Force a fixed width so every
    # batch lines up with the schema given to mapInPandas. Keep object dtype
    # so padded (all-missing) columns are not turned into float columns.
    yield split_fields.reindex(columns=expected_columns).astype(object)
  
# Try the parser on the small sample first and display the parsed rows.
tweets_sample.mapInPandas(parse_tweets, schema = file_schema_str).show()

Process the collected tweets from the in-memory `tweets` table.

In [None]:
# Parse every collected tweet into the string-typed columns of
# `file_schema_str`, then run the sparkLDA text preprocessing on the result.
parsed_tweets = tweets.mapInPandas(parse_tweets, schema=file_schema_str)
df = preprocess_text(parsed_tweets)

In [None]:
# DBFS location of the saved pipeline model that is loaded with PipelineModel.load.
pipelinePath = "dbfs:/mnt/my_twitter_data_project/ML_models/LDA-pipeline-model_Oct_data"
from pyspark.ml import PipelineModel

In [None]:
# Load the previously saved pipeline model from `pipelinePath`.
savedPipelineModel = PipelineModel.load(pipelinePath)

# Apply the pipeline to the preprocessed tweets. Later cells read a
# "topicDistribution" column from this result.
df_with_topics = savedPipelineModel.transform(df)

In [None]:
import pyspark.sql.functions as F

from textblob import TextBlob

def get_sentiment_vectorized(texts):
    """Compute the TextBlob sentiment polarity of every text in a Series.

    Args:
        texts: pandas Series of tweet texts.

    Returns:
        A pandas Series aligned with ``texts`` holding the polarity of each
        text. Entries that are not strings (for example null tweet text) give
        NaN instead of raising.
    """
    def polarity_of(text):
        # TextBlob only accepts strings; without this guard one null tweet
        # would fail the whole batch.
        if not isinstance(text, str):
            return float("nan")
        return TextBlob(text).sentiment.polarity

    return texts.apply(polarity_of)
  
# Wrap the Series-based scorer as a pandas UDF with a float return type.
getSentiment = F.pandas_udf(get_sentiment_vectorized, FloatType())

In [None]:
def _vector_to_list(vector):
    """Convert a vector value to a plain Python list of its elements."""
    return vector.toArray().tolist()


# UDF that turns the topic-distribution vector into an array-of-floats column.
to_array = F.udf(_vector_to_list, ArrayType(FloatType()))

df_with_topics = df_with_topics.withColumn(
    "topicDistributionArray",
    to_array("topicDistribution"),
)

In [None]:
# Tweet columns that are kept next to the per-topic columns.
cols_select = [
    'tweet_text',
    'hash_tag',
    'created_at',
    'retweet_count',
    'favorite_count',
]
print(cols_select)

In [None]:
# Expand the topic-distribution array into one column per topic
# (topic_0 .. topic_{n_topics-1}) and keep only those plus the selected
# tweet columns.
topic_columns = [
    F.col("topicDistributionArray")[topic_idx].alias("topic_" + str(topic_idx))
    for topic_idx in range(n_topics)
]
df_with_topics = df_with_topics.select(*cols_select, *topic_columns)

In [None]:
# Preview the first 5 rows of the per-topic columns.
df_with_topics.show(5)

In [None]:
# Score each tweet's text with the sentiment UDF, then overwrite the column
# with the score formatted to three decimal places.
sentiment_score = getSentiment(F.col("tweet_text"))
df_with_topics_sentiment = (
    df_with_topics
    .withColumn("sentiment", sentiment_score)
    .withColumn("sentiment", F.format_number("sentiment", 3))
)

In [None]:
# NOTE(review): stray cell — evaluates an empty list and has no other effect;
# candidate for removal.
[]

In [None]:
# Show the tweet text, its sentiment and every per-topic column for the first
# 20 rows, without truncating long values.
display_columns = ["tweet_text", "sentiment"] + ["topic_" + str(i) for i in range(n_topics)]
df_with_topics_sentiment.select(*display_columns).show(20, truncate=False)