In [None]:
import os
import sys

from pyspark import SparkContext
from pyspark.ml.functions import vector_to_array
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.streaming import StreamingContext
from pyspark.streaming.kinesis import KinesisUtils, InitialPositionInStream

# Build the SparkSession used by every later cell; getOrCreate() hands back an
# already-running session when one exists instead of creating a second one.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Structured Streaming - Twitter Sentiment")
    .getOrCreate()
)

In [None]:

# Kinesis stream (and its AWS region) that the streaming reader subscribes to.
kinesisStreamName = "twitter-data-kinesis"
kinesisRegion = "us-east-1"

# Credentials are taken from the standard AWS environment variables so that no
# secret ever has to be typed into (and saved with) the notebook. When a
# variable is not set the value falls back to the empty string, which is what
# this cell assigned before.
awsAccessKeyId = os.environ.get("AWS_ACCESS_KEY_ID", "")
awsSecretKey = os.environ.get("AWS_SECRET_ACCESS_KEY", "")

In [None]:

# Source: a streaming DataFrame over the Kinesis stream configured above,
# starting from the latest position in the stream.
kinesisDF = (
    spark.readStream
    .format("kinesis")
    .option("streamName", kinesisStreamName)
    .option("region", kinesisRegion)
    .option("initialPosition", "latest")
    .option("format", "json")
    .option("awsAccessKey", awsAccessKeyId)
    .option("awsSecretKey", awsSecretKey)
    .option("inferSchema", "true")
    .load()
)

# Sink: append incoming records to the in-memory query named "tweets", which
# later cells read with SQL. Note that `df` holds the value returned by
# .start() (the running query), not a DataFrame.
df = (
    kinesisDF.writeStream
    .format("memory")
    .outputMode("append")
    .queryName("tweets")
    .start()
)

In [None]:
# Stop the streaming query that the previous cell started and stored in `df`.
df.stop()

In [None]:
%sql

-- Peek at the captured records: the partition key plus the payload decoded as text.
SELECT
  partitionKey,
  CAST(data AS STRING)
FROM tweets;

partitionKey,data
welcomehier,"The pope has joined forces with #Microsoft and #IBM to create a doctrine for ethical #AI and facial recognition. Here's how the Vatican wants to shape it👇 https://t.co/TW653XayEE	Microsoft, IBM, AI	Fri Nov 27 20:28:00 +0000 2020	0	0	True	False	1332420933169143809	AbdulrahmanAlSultan 🇸🇦	welcomehier	2482	SA	None"
AlexeyChang,"Machine Learning with Core ML 2 and Swift 5 @udemy ➡️https://t.co/xHqTShMund #MachineLearning #Swift5 #AugmentedReality #DeepLearning #Analytics #DataScience #AI #IoT #IIoT #Python #RStats #CloudComputing #Serverless #Linux #Coding #100DaysofCode #100DaysOfMLCode #udemycoupon https://t.co/qD5q6TLm4a	MachineLearning, Swift5, AugmentedReality, DeepLearning, Analytics, DataScience, AI, IoT, IIoT, Python, RStats, CloudComputing, Serverless, Linux, Coding, 100DaysofCode, 100DaysOfMLCode, udemycoupon	Fri Nov 27 20:28:04 +0000 2020	0	0	True	False	1332420949266886659	Alexey Chang	AlexeyChang	146	Russia	None"
UW_CSTV,New research: #AI systems lack common sense needed to generate plausible sentences https://t.co/ZDa0IHfkvc via @TechXplore_com	AI	Fri Nov 27 20:28:07 +0000 2020	0	0	False	False	1332420963556843524	CSTV	UW_CSTV	190	University of Waterloo	None
fintechna,"The latest Breaking Banks Asia! https://t.co/EuUiryRZI1 Thanks to @LeeSlaterSCB @MaximilieMoreau @AlgorithmXLab #fintech #ai	fintech, ai	Fri Nov 27 20:28:09 +0000 2020	0	0	True	False	1332420969680531456	FINTECHNA	fintechna	27980	LON | NYC | HKG | SYD | SIN	None"
ML_Tweet_Bot,"Machine Learning with Core ML 2 and Swift 5 @udemy ➡️https://t.co/xHqTShMund #MachineLearning #Swift5 #AugmentedReality #DeepLearning #Analytics #DataScience #AI #IoT #IIoT #Python #RStats #CloudComputing #Serverless #Linux #Coding #100DaysofCode #100DaysOfMLCode #udemycoupon https://t.co/qD5q6TLm4a	MachineLearning, Swift5, AugmentedReality, DeepLearning, Analytics, DataScience, AI, IoT, IIoT, Python, RStats, CloudComputing, Serverless, Linux, Coding, 100DaysofCode, 100DaysOfMLCode, udemycoupon	Fri Nov 27 20:28:09 +0000 2020	0	0	True	False	1332420971844788229	Machine Learning Bot	ML_Tweet_Bot	4149	None	None"
MarkTwa01197091,"Drone taxis are a reality in China! #AI #MachineLearning #Robotics #drone @XHNews @SpirosMargaris @DeepLearn007 @HaroldSinnott @gvalan @Paula_Piccard @ipfconline1 @ShiCooks @diioannid @mvollmer1 @Ym78200 @kalydeoo @Nicochan33 @3itcom @Fabriziobustama https://t.co/Sva8fpJgl1	AI, MachineLearning, Robotics, drone	Fri Nov 27 20:28:09 +0000 2020	0	0	True	False	1332420972692074496	Mark Twain	MarkTwa01197091	10583	None	None"
mvollmer1,"6 ways #5G mobile #Broadband will impact the world @wef #AI #MachineLearning #IoT #CES2021 #Automation #4IR #DigitalTransformation #AR #FutureofWork cc @SpirosMargaris @jblefevre60 @Nicochan33 @AudreyDesisto @mvollmer1 @Paula_Piccard https://t.co/lwpBNyGICs	5G, Broadband, AI, MachineLearning, IoT, CES2021, Automation, 4IR, DigitalTransformation, AR, FutureofWork	Fri Nov 27 20:28:13 +0000 2020	0	0	True	False	1332420989892894720	Dr. Marcell Vollmer #StayHome #StaySafe	mvollmer1	65306	Frankfurt, Germany	None"
askN0B0DY,"#Trending: UN experts sound alarm over AI-enhanced racial profiling - The Jakarta Post - Jakarta Post #ai #ml #neuralnetworks #deeplearning #machinelearning Read More Here: https://t.co/Cj9HVuiD9Q	Trending, ai, ml, neuralnetworks, deeplearning, machinelearning	Fri Nov 27 20:28:16 +0000 2020	0	0	True	False	1332420999552315394	Nobody	askN0B0DY	954	Some Server	None"
dcasacuberta,"Looks promising... Fri Nov 27 20:28:18 +0000 2020	0	0	False	False	1332421009820045314	David Casacuberta	dcasacuberta	2095	Barcelona, Spain	None"
SvonFriedeburg,"#GoodNewsFriday: Using artificial intelligence, @NASA scientists mapped 1.8 billion trees to better understand climate change! Fascinating and indicative of the positive impact #AI can have. https://t.co/ua6iVmMZYH	GoodNewsFriday, AI	Fri Nov 27 20:28:24 +0000 2020	0	0	False	True	1332421034054725632	Stephanie von Friedeburg	SvonFriedeburg	3124	Washington, DC	None"


In [None]:
# Keep only the record payload, cast to a string, from the in-memory "tweets" table.
tweets = spark.sql(
    "select cast(data as string) from tweets"
)

In [None]:
# Print the column names and types of `tweets`.
tweets.printSchema()

In [None]:
# Display the first 5 records without truncating the text.
tweets.show(5, truncate=False)

In [None]:
# Number of rows returned by the `tweets` query.
tweets.count()

In [None]:
# Take at most 50 records as a working sample and mark it for caching
# (cache() returns the same DataFrame, so it can be chained), then preview 5 rows.
tweets_sample = tweets.limit(50).cache()
tweets_sample.show(5)

In [None]:
# Same 5-row preview of the sample, this time without truncating the text.
tweets_sample.show(5, truncate=False)

In [None]:
%fs ls /mnt/

path,name,size
dbfs:/mnt/anonymous_telecom/,anonymous_telecom/,0
dbfs:/mnt/aws_logs/,aws_logs/,0
dbfs:/mnt/bikeshare/,bikeshare/,0
dbfs:/mnt/cdr/,cdr/,0
dbfs:/mnt/china_telecom_churn/,china_telecom_churn/,0
dbfs:/mnt/geo/,geo/,0
dbfs:/mnt/movie/,movie/,0
dbfs:/mnt/my_twitter_data_project/,my_twitter_data_project/,0
dbfs:/mnt/orange/,orange/,0
dbfs:/mnt/paysim_fraud/,paysim_fraud/,0


In [None]:
%fs ls /mnt/my_twitter_data_project/

path,name,size
dbfs:/mnt/my_twitter_data_project/2020/,2020/,0
dbfs:/mnt/my_twitter_data_project/EMR_bootstrap.sh,EMR_bootstrap.sh,231
dbfs:/mnt/my_twitter_data_project/ML_models/,ML_models/,0
dbfs:/mnt/my_twitter_data_project/_metadata,_metadata,0
dbfs:/mnt/my_twitter_data_project/archived/,archived/,0
dbfs:/mnt/my_twitter_data_project/e-C5ILVFSIV1XJG93IA6UOH3E9D/,e-C5ILVFSIV1XJG93IA6UOH3E9D/,0
dbfs:/mnt/my_twitter_data_project/e-DAI1FF74OJ28MX3NGIPULOOE7/,e-DAI1FF74OJ28MX3NGIPULOOE7/,0
dbfs:/mnt/my_twitter_data_project/e-GZ60A1KFQY7FICPF38QRQ33A/,e-GZ60A1KFQY7FICPF38QRQ33A/,0
dbfs:/mnt/my_twitter_data_project/parquet/,parquet/,0
dbfs:/mnt/my_twitter_data_project/sparkLDA.zip,sparkLDA.zip,9752


In [None]:
# Ship the zipped sparkLDA package through the SparkContext so that the
# `from sparkLDA...` imports in the next cell can be resolved.
sc = spark.sparkContext
sc.addPyFile("dbfs:/mnt/my_twitter_data_project/sparkLDA.zip")

In [None]:
from sparkLDA.config import n_topics, extra_for_stemmed, seedNum, file_schema
from sparkLDA.utils import show_topics, evaluate
from sparkLDA.processing import preprocess_text

In [None]:
# Show the sample's schema object (same information as printSchema() in the next cell).
tweets_sample.schema

In [None]:
# Print the sample's column names and types.
tweets_sample.printSchema()

In [None]:
# Schema for the tab-separated tweet record: every field is a nullable string,
# listed in the order the fields appear in the record.
file_schema_str = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "tweet_text",
        # hash_tag stays a plain string here (an ArrayType(StringType()) variant was left disabled).
        "hash_tag",
        "created_at",
        "retweet_count",
        "favorite_count",
        "retweeted",
        "truncated",
        "id",
        "user_name",
        "screen_name",
        "followers_count",
        "location",
        "geo",
        # NOTE(review): presumably a catch-all for records with an extra field — confirm.
        "invalid",
    )
])

In [None]:
def parse_tweets(raw_records, n_columns=None):
  """Split each tab-separated record into a fixed number of columns.

  Intended for ``DataFrame.mapInPandas``: consumes an iterator of pandas
  DataFrames that carry a string column named ``data`` and yields one
  DataFrame of split fields per input batch.

  ``str.split(expand=True)`` on its own produces as many columns as the
  widest record in the batch, so the width varied from batch to batch and
  only matched the target schema by luck. The output is now always exactly
  ``n_columns`` wide:

  * records with fewer fields are padded on the right with ``None``;
  * records with more fields keep the unsplit remainder (tabs included)
    in the last column.

  Args:
    raw_records: iterator of pandas DataFrames with a ``data`` column.
    n_columns: number of output columns. Defaults to the number of fields
      in ``file_schema_str`` (looked up when the first batch is processed).

  Yields:
    pandas DataFrame with integer column labels ``0 .. n_columns - 1``.
  """
  if n_columns is None:
    n_columns = len(file_schema_str.fields)
  for pdf in raw_records:
    # At most n_columns - 1 splits => at most n_columns resulting columns.
    parts = pdf["data"].str.split(pat="\t", n=n_columns - 1, expand=True)
    # Pad batches whose widest record is still narrower than the schema.
    for missing in range(parts.shape[1], n_columns):
      parts[missing] = None
    yield parts
  
# Try the parser on the cached sample and display the resulting columns.
tweets_sample.mapInPandas(
    parse_tweets,
    schema=file_schema_str,
).show()

Process the full in-memory `tweets` data (not just the sample): parse the tab-separated records, then preprocess the text.

In [None]:
# Parse every captured record into columns and run the sparkLDA text
# preprocessing on the result. Note that `df` is rebound here: earlier in the
# notebook the same name held the streaming query returned by .start().
df = preprocess_text(
    tweets.mapInPandas(parse_tweets, schema=file_schema_str)
)

In [None]:
# DBFS path of the saved pipeline model; passed to PipelineModel.load in the next cell.
pipelinePath = "dbfs:/mnt/my_twitter_data_project/ML_models/LDA-pipeline-model_Oct_data"
from pyspark.ml import PipelineModel

In [None]:
# Load the persisted pipeline model from pipelinePath.
savedPipelineModel = PipelineModel.load(pipelinePath)

# Apply the pipeline to the preprocessed tweets; later cells read the
# "topicDistribution" column from this result.
df_with_topics = savedPipelineModel.transform(df)

In [None]:
import pyspark.sql.functions as F

from textblob import TextBlob

def get_sentiment_vectorized(texts):
    """Compute the TextBlob polarity score for each entry of a pandas Series.

    Args:
        texts: pandas Series of tweet texts. Entries may be missing
            (``None``/``NaN``), e.g. when a record could not be parsed.

    Returns:
        A float Series aligned with ``texts`` holding
        ``TextBlob(text).sentiment.polarity`` for string entries and ``NaN``
        for every entry that is not a string.
    """
    def polarity(text):
        # TextBlob raises TypeError for anything that is not a string, which
        # would fail the whole batch on a single missing tweet text.
        if not isinstance(text, str):
            return None
        return TextBlob(text).sentiment.polarity

    # astype(float) turns the None placeholders into NaN and keeps the result
    # a float Series even when the batch is empty or entirely missing.
    return texts.apply(polarity).astype(float)
  
# Wrap the function above as a pandas UDF that returns a FloatType column.
getSentiment = F.pandas_udf(get_sentiment_vectorized, FloatType())

In [None]:
def to_array(col):
    """Convert an ML vector column into an ``array<float>`` column.

    Uses Spark's built-in ``vector_to_array`` rather than a row-at-a-time
    Python UDF, so the vectors are converted without being serialized to a
    Python worker. ``dtype="float32"`` keeps the element type the same as
    the ``ArrayType(FloatType())`` the UDF used to return.

    Args:
        col: a Column, or the name of a column, that holds ML vectors.

    Returns:
        Column of type ``array<float>``.
    """
    return vector_to_array(col, dtype="float32")


# Expose the per-topic weights as a plain array so that single topics can be
# selected by index in the following cells.
df_with_topics = df_with_topics.withColumn("topicDistributionArray",
                                           to_array("topicDistribution"))

In [None]:
# Tweet columns that are kept next to the per-topic weights.
cols_select = [
    'tweet_text',
    'hash_tag',
    'created_at',
    'retweet_count',
    'favorite_count',
]
print(cols_select)

In [None]:
# Keep the chosen tweet columns and spread the topic array into one column per
# topic (topic_0 .. topic_{n_topics-1}).
# NOTE: the select drops "topicDistributionArray", so running this cell a
# second time without re-running the previous one fails.
df_with_topics = df_with_topics.select(
    *cols_select,
    *(
        F.col("topicDistributionArray")[i].alias("topic_" + str(i))
        for i in range(n_topics)
    )
)

In [None]:
# Preview 5 rows of the tweet columns together with the per-topic columns.
df_with_topics.show(5)

In [None]:
# Score every tweet text with the sentiment UDF, then format the score with
# three decimals.
# NOTE(review): format_number presumably yields a string column, so
# "sentiment" may no longer be numeric after this step — confirm before
# aggregating on it.
df_with_topics_sentiment = (
    df_with_topics
    .withColumn("sentiment", getSentiment(F.col("tweet_text")))
    .withColumn("sentiment", F.format_number("sentiment", 3))
)

In [None]:
# NOTE(review): leftover cell — a bare empty list has no effect; safe to delete.
[]

In [None]:
# Final view: tweet text, its sentiment score and every per-topic weight,
# 20 rows, untruncated.
df_with_topics_sentiment.select(
    "tweet_text",
    "sentiment",
    *("topic_" + str(i) for i in range(n_topics))
).show(20, truncate=False)