In [1]:
# Create app and generate access tokens at https://apps.twitter.com

from pyspark import SparkContext
from pyspark.streaming import StreamingContext

# creating SparkContext with 2 threads
sc = SparkContext(master='local[2]', appName='Twitter Processing')

# creating streaming context
streaming = StreamingContext(sc, batchDuration=15)

In [2]:
import json

# create DStream from localhost:port
tweet_ds = streaming.socketTextStream(hostname='localhost', port=4455) \
                    .map(json.loads)

# count words in each 15sec stream batch
words_ds = tweet_ds.map(lambda entry: entry['text']) \
                   .flatMap(lambda line: line.lower().split())\
                   .filter(lambda word: word.startswith('#'))

counts_ds = words_ds.map(lambda word: (word, 1)) \
                    .window(60, 15) \
                    .reduceByKey(lambda x, y: x + y)

# output first 10 elements of each RDD
counts_ds \
    .repartition(1) \
    .transform(lambda rdd: rdd.sortBy(lambda k_v: -k_v[1]))\
    .pprint(num=10)

# TODO: output top 10 most frequent words for each RDD
# http://spark.apache.org/docs/latest/streaming-programming-guide.html#transformations-on-dstreams
# http://spark.apache.org/docs/latest/programming-guide.html#transformations

# TODO: output top 10 most trending hashtags for each RDD

# TODO: use window to count most frequent words over last 10 RDDs
# http://spark.apache.org/docs/latest/streaming-programming-guide.html#window-operations

In [None]:
# TODO: train sentiment analysis model 
# TODO: output average sentiment for each hashtag
# http://thinknook.com/twitter-sentiment-analysis-training-corpus-dataset-2012-09-22/
# http://spark.apache.org/docs/latest/streaming-programming-guide.html#dataframe-and-sql-operations

In [None]:
# start spark stream processing
streaming.start()

# run until terminated
streaming.awaitTermination()

-------------------------------------------
Time: 2017-05-25 18:35:00
-------------------------------------------
('#crimea.', 1)
('#fsb', 1)
('#russia', 1)

-------------------------------------------
Time: 2017-05-25 18:35:15
-------------------------------------------
('#russia', 6)
('#comey', 2)
('#fbi', 1)
('#coltparanormal', 1)
('#crimea.', 1)
('#noevidence', 1)
('#moscow', 1)
('#trumprussia', 1)
('#t…', 1)
('#clinton', 1)
...

-------------------------------------------
Time: 2017-05-25 18:35:30
-------------------------------------------
('#russia', 11)
('#comey', 2)
('#fbi', 1)
('#coltparanormal', 1)
('#putin', 1)
('#moscow', 1)
('#refugees', 1)
('#clinton', 1)
('#gowdy', 1)
('#crimea.', 1)
...

-------------------------------------------
Time: 2017-05-25 18:35:45
-------------------------------------------
('#russia', 13)
('#comey', 3)
("#russia's", 2)
('#fbi', 2)
('#russia.', 2)
('#russia…', 2)
('#coltparanormal', 1)
('#grozny,', 1)
('#relax', 1)
('#moscow', 1)
...

--------