# Twitter Sentiment Analysis Part 3
## Create a real-time dashboard PixieApp 

In [38]:
# uncomment and run the line below to install tweepy if needed
# !pip install tweepy

## Set up twitter authentication
Make sure to fill in the tokens below before running this cell

In [1]:
from tweepy import OAuthHandler

# Go to http://apps.twitter.com and create an app.
# The consumer key and secret will be generated for you once the app is created.
consumer_key="XXXX"
consumer_secret="XXXX"

# After the step above, you will be redirected to your app's page.
# Create an access token under the "Your access token" section.
access_token="XXXX"
access_token_secret="XXXX"

# OAuth handler carrying the four credentials above; it is handed to the
# tweepy Stream when a Twitter stream is started.
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

## Set up a Natural Language Understanding client instance

In [2]:
from watson_developer_cloud import NaturalLanguageUnderstandingV1
from watson_developer_cloud.natural_language_understanding_v1 import Features, SentimentOptions, EntitiesOptions

# Shared Natural Language Understanding client; the tweet listener calls
# `nlu.analyze` on every incoming tweet.
# Replace the 'XXXX' placeholders with your own service credentials before
# running this cell. `version` is presumably the NLU API version date.
nlu = NaturalLanguageUnderstandingV1(
    version='2017-02-27',
    username='XXXX',
    password='XXXX'
)

## Create the Twitter Stream

In [3]:
import csv
import json
import os
import shutil

from bs4 import BeautifulSoup as BS
from pixiedust.utils import Logger
from pyspark.sql.types import StructType, StructField, StringType, DateType, IntegerType, TimestampType
from six import iteritems
from tweepy import Stream
from tweepy.streaming import StreamListener

def ensure_dir(dir, delete_tree = False):
    """
    Make sure the directory `dir` exists and return its absolute path.

    A missing directory is created (including intermediate directories).
    When the directory already exists and `delete_tree` is true, it is
    removed with all its content and created again empty; otherwise an
    existing directory is left untouched.
    """
    already_there = os.path.exists(dir)
    if already_there and delete_tree:
        # Start from a clean slate: drop the whole tree first
        shutil.rmtree(dir)
    if delete_tree or not already_there:
        os.makedirs(dir)
    return os.path.abspath(dir)

def init_output_dirs():
    """
    (Re)create the working directories and return their absolute paths.

    The "output" directory is wiped and created again on every call; a "raw"
    sub-directory is then created inside it.

    Returns a tuple (path of "output", path of "output/raw").
    """
    root = ensure_dir("output", delete_tree = True)
    raw = ensure_dir(os.path.join(root, "raw"))
    return (root, raw)
    
# Wipe and re-create the working directories as soon as this cell runs
root_dir, output_dir = init_output_dirs()

def _strip_html(markup):
    "Return the text content of an HTML fragment, without surrounding blanks"
    return BS(markup, "html.parser").text.strip()

# One entry per column kept from a tweet: the column name, its Spark type and,
# optionally, a "transform" callable applied to the value before it is stored
field_metadata = [
    {"name": "created_at", "type": TimestampType()},
    {"name": "text", "type": StringType()},
    {"name": "source", "type": StringType(), "transform": _strip_html},
    {"name": "sentiment", "type": StringType()},
    {"name": "entity", "type": StringType()},
    {"name": "entity_type", "type": StringType()}
]

# Column names, in schema order
fieldnames = [field["name"] for field in field_metadata]

# Column name -> transform callable, only for the columns that declare one
transforms = dict(
    (field["name"], field["transform"])
    for field in field_metadata
    if "transform" in field
)

@Logger()
class RawTweetsListener(StreamListener):
    """
    Stream listener that enriches every incoming tweet with Watson NLU
    results and saves the tweets as small CSV files in `output_dir`.

    Tweets are buffered in memory and written out in batches of `BATCH_SIZE`
    rows. File names cycle through `MAX_FILES` slots (tweets0.csv,
    tweets1.csv, ...), so a slot is overwritten once the counter wraps around.

    Attributes:
        buffered_data: rows (dicts keyed by column name) not yet written to disk
        counter: index used in the name of the next CSV file
        tweet_count: number of tweets successfully enriched so far
    """
    # Number of buffered tweets that triggers a write to a new CSV file
    BATCH_SIZE = 10
    # Number of distinct file names used before the counter starts over at 0
    MAX_FILES = 20

    def __init__(self):
        # Let the base class set up its own state too, so that any inherited
        # handler relying on it keeps working
        super(RawTweetsListener, self).__init__()
        self.buffered_data = []
        self.counter = 0
        self.tweet_count = 0

    def flush_buffer_if_needed(self):
        "Check the buffer capacity and write to a new file if needed"
        length = len(self.buffered_data)
        if length > 0 and length % self.BATCH_SIZE == 0:
            with open(os.path.join( output_dir, "tweets{}.csv".format(self.counter)), "w") as fs:
                # Advance to the next file slot, wrapping around after MAX_FILES.
                # NOTE(review): file names are reused after the wrap-around;
                # confirm the downstream file reader picks up rewritten files.
                self.counter = (self.counter + 1) % self.MAX_FILES
                # No header row is written; columns follow the order of `fieldnames`
                csv_writer = csv.DictWriter( fs, fieldnames = fieldnames)
                csv_writer.writerows(self.buffered_data)
            self.buffered_data = []

    def enrich(self, data):
        """
        Add the "sentiment", "entity" and "entity_type" keys to a tweet.

        `data` is the tweet as a dict; its "text" value is also rewritten with
        double quotes replaced by single quotes. The entity fields come from
        the first entity of the NLU response, or are empty strings when the
        response has no entity.

        Returns the enriched dict, or None when anything goes wrong (the error
        is logged as a warning), in which case the caller drops the tweet.
        """
        try:
            data['text'] = data['text'].replace('"', "'")
            response = nlu.analyze( 
                text = data['text'],
                features=Features(sentiment=SentimentOptions(), entities=EntitiesOptions())
            )
            data["sentiment"] = response["sentiment"]["document"]["label"]
            top_entity = response["entities"][0] if len(response["entities"]) > 0 else None
            data["entity"] = top_entity["text"] if top_entity is not None else ""
            data["entity_type"] = top_entity["type"] if top_entity is not None else ""
            return data
        except Exception as e:
            # Best effort: a failing tweet must not bring the whole stream down.
            # `warn` is presumably provided by the @Logger() class decorator.
            self.warn("Error from Watson service while enriching data: {}".format(e))
            return None

    def on_data(self, data):
        """
        Handle one raw message received from the stream.

        `data` is a JSON string. It is parsed and enriched, then reduced to
        the columns listed in `fieldnames` (with the per-column transforms
        applied) and buffered; the buffer is flushed to disk when it is full.
        Messages that could not be enriched are skipped.

        Always returns True (presumably telling tweepy to keep the stream open).
        """
        def transform(key, value):
            return transforms[key](value) if key in transforms else value
        data = self.enrich(json.loads(data))
        if data is not None:
            self.tweet_count += 1
            self.buffered_data.append(
                {key:transform(key,value) \
                     for key,value in iteritems(data) \
                     if key in fieldnames}
            )
            self.flush_buffer_if_needed()
        return True

    def on_error(self, status):
        """
        Report a streaming error on stdout.

        Returns False (presumably telling tweepy to stop the stream).
        """
        print("An error occured while receiving streaming data: {}".format(status))
        return False

Pixiedust database opened successfully


In [4]:
def start_stream(queries):
    """
    Asynchronously start a new Twitter stream.

    `queries` is the list of terms to track; only tweets in English are
    requested. The stream is created with the module-level `auth` handler and
    a fresh RawTweetsListener.

    Returns the started tweepy Stream so the caller can disconnect it later.
    """
    stream = Stream(auth, RawTweetsListener())
    filter_args = {"track": queries, "languages": ["en"]}
    try:
        # Newer tweepy 3.x releases name the flag `is_async`, because `async`
        # is a reserved word since Python 3.7 and cannot be written as a
        # keyword argument any more.
        stream.filter(is_async=True, **filter_args)
    except TypeError:
        # Older tweepy releases reject `is_async` as an unexpected keyword
        # before doing any work; they only know the flag as `async`. It is
        # passed through a dict so this file still parses on Python 3.7+.
        stream.filter(**dict(filter_args, **{"async": True}))
    return stream

## Create a Spark Streaming DataFrame

In [5]:
def start_streaming_dataframe(output_dir):
    """
    Start a Spark Streaming DataFrame from a file source.

    The CSV files found in `output_dir` are read with a schema derived from
    `field_metadata` (every column nullable).

    Returns the streaming DataFrame.
    """
    columns = []
    for field in field_metadata:
        columns.append(StructField(field["name"], field["type"], True))
    csv_options = dict(
        schema = StructType(columns),
        multiLine = True,
        timestampFormat = 'EEE MMM dd kk:mm:ss Z yyyy',
        ignoreTrailingWhiteSpace = True,
        ignoreLeadingWhiteSpace = True
    )
    return spark.readStream.csv(output_dir, **csv_options)

## Create and Run Spark Structured Queries

In [6]:
def start_parquet_streaming_query(csv_sdf):
    """
    Create and run a streaming query from a Structured Streaming DataFrame,
    writing the results into a parquet database.

    The parquet files go to "output_parquet" and the checkpoint data to
    "output_chkpt", both under `root_dir`; the query is triggered every
    2 seconds.

    Returns the started streaming query.
    """
    parquet_path = os.path.join(root_dir, "output_parquet")
    checkpoint_path = os.path.join(root_dir, "output_chkpt")
    writer = csv_sdf.writeStream.format("parquet")
    writer = writer.option("path", parquet_path)
    writer = writer.trigger(processingTime="2 seconds")
    writer = writer.option("checkpointLocation", checkpoint_path)
    return writer.start()

## Create a real-time dashboard

## StreamsManager class for controlling the lifecycle of the different streams

In [7]:
class StreamsManager():
    """
    Controls the lifecycle of the Twitter stream and of the Spark streams.

    Attributes:
        twitter_stream: the running tweepy stream, or None when there is none
        csv_sdf: the streaming DataFrame reading the raw CSV files, or None
    """
    def __init__(self):
        self.twitter_stream = None
        self.csv_sdf = None

    def reset(self, search_query = None):
        """
        Stop everything that is running and optionally start over.

        The current Twitter stream (if any) is disconnected, every active
        Spark streaming query is stopped and the output directories are
        re-created empty. When `search_query` is given, a new Twitter stream
        tracking it and a new streaming DataFrame are started; otherwise both
        attributes are set to None.
        """
        if self.twitter_stream is not None:
            self.twitter_stream.disconnect()
        # Stop all the active streaming queries and re-initialize the directories
        for active_query in spark.streams.active:
            active_query.stop()
        self.root_dir, self.output_dir = init_output_dirs()
        if search_query is None:
            self.twitter_stream = None
            self.csv_sdf = None
        else:
            self.twitter_stream = start_stream([search_query])
            # NOTE(review): reads the module-level `output_dir`, not the
            # `self.output_dir` assigned just above
            self.csv_sdf = start_streaming_dataframe(output_dir)

    def __del__(self):
        # Shut the streams down when the manager is garbage collected
        self.reset()

# Single manager instance shared by the PixieApps below
streams_manager = StreamsManager()

## StreamingQueriesApp to live monitor the progress of the active Streaming Queries

In [8]:
from pixiedust.display.app import *
@PixieApp
class StreamingQueriesApp():
    """
    PixieApp showing a progress report for every active Spark streaming query.

    The main screen is an empty placeholder that re-requests the
    `show_progress` route every 5 seconds (`pd_refresh_rate="5000"`).
    """
    @route()
    def main_screen(self):
        "Default route: a self-refreshing placeholder for the progress report"
        return """
<div class="no_loading_msg" pd_refresh_rate="5000" pd_options="show_progress=true">
</div>
        """

    @route(show_progress="true")
    def do_show_progress(self):
        """
        Render one metric/value table per active streaming query.

        `lastProgress` is None until a query has reported its first progress
        update, so the template falls back to an empty mapping in that case
        and renders an empty table instead of failing.
        """
        return """
{%for query in this.spark.streams.active%}
    <div>
    <div class="page-header">
        <h1>Progress Report for Spark Stream: {{query.id}}</h1>
    </div>
    <table>
        <thead>
          <tr>
             <th>metric</th>
             <th>value</th>
          </tr>
        </thead>
        <tbody>
            {%for key, value in (query.lastProgress or {}).items()%}
            <tr>
                <td>{{key}}</td>
                <td>{{value}}</td>
            </tr>
            {%endfor%}
        </tbody>
    </table>
    </div>
{%endfor%}
        """

## TweetInsightApp shows the metrics in a dashboard 

In [9]:
import time
from wordcloud import WordCloud
import matplotlib.pyplot as plt

@PixieApp
class TweetInsightApp():
    """
    PixieApp dashboard for the tweets matching a search query.

    The main screen asks for a search query. Submitting it restarts the
    streams and shows a dashboard made of self-refreshing areas: the number
    of tweets received, two charts and a word cloud of the entities.

    Attributes:
        parquet_df: DataFrame over the parquet output, set by
            `do_search_query` and refreshed by `do_display_metric1`; the
            other routes read it.
    """
    @route()
    def main_screen(self):
        "Default route: a centered input field for the search query"
        return """
<style>
    div.outer-wrapper {
        display: table;width:100%;height:300px;
    }
    div.inner-wrapper {
        display: table-cell;vertical-align: middle;height: 100%;width: 100%;
    }
</style>
<div class="outer-wrapper">
    <div class="inner-wrapper">
        <div class="col-sm-3"></div>
        <div class="input-group col-sm-6">
          <input id="query{{prefix}}" type="text" class="form-control"
              value=""
              placeholder="Enter a search query (e.g. baseball)">
          <span class="input-group-btn">
            <button class="btn btn-default" type="button" pd_options="search_query=$val(query{{prefix}})">
                Go
            </button>
          </span>
        </div>
    </div>
</div>
        """

    @route(search_query="*")
    def do_search_query(self, search_query):
        """
        Restart the streams for `search_query` and return the dashboard layout.

        Blocks until the parquet output can be queried: any failure of the
        query is treated as "not ready yet" and retried after 5 seconds.
        """
        streams_manager.reset(search_query)
        start_parquet_streaming_query(streams_manager.csv_sdf)
        parquet_dir = os.path.join(root_dir, "output_parquet")
        while True:
            try:
                self.parquet_df = spark.sql("select * from parquet.`{}`".format(parquet_dir))
                break
            except Exception:
                # Only regular errors are retried, so KeyboardInterrupt and
                # SystemExit can still break out of the wait loop
                time.sleep(5)
        return """
<div class="container">
    <div id="header{{prefix}}" class="row no_loading_msg" pd_refresh_rate="5000" pd_target="header{{prefix}}">
        <pd_script>
print("Number of tweets received: {}".format(streams_manager.twitter_stream.listener.tweet_count))
        </pd_script>
    </div>
    <div class="row" style="min-height:300px">
        <div class="col-sm-5">
            <div id="metric1{{prefix}}" pd_refresh_rate="10000" class="no_loading_msg"
                pd_options="display_metric1=true" pd_target="metric1{{prefix}}">
            </div>
        </div>
        <div class="col-sm-5">
            <div id="metric2{{prefix}}" pd_refresh_rate="12000" class="no_loading_msg"
                pd_options="display_metric2=true" pd_target="metric2{{prefix}}">
            </div>
        </div>
    </div>
    
    <div class="row" style="min-height:400px">
        <div class="col-sm-offset-1 col-sm-10">
            <div id="word_cloud{{prefix}}" pd_refresh_rate="20000" class="no_loading_msg"
                pd_options="display_wc=true" pd_target="word_cloud{{prefix}}">
            </div>
        </div>
    </div>
</div>
        """

    @route(display_metric1="*")
    def do_display_metric1(self, display_metric1):
        """
        Bar chart keyed by sentiment and clustered by entity type.

        Also re-creates `self.parquet_df` from the parquet output so the
        charts are rendered from a fresh query.
        """
        parquet_dir = os.path.join(root_dir, "output_parquet")
        self.parquet_df = spark.sql("select * from parquet.`{}`".format(parquet_dir))
        return """
<div class="no_loading_msg" pd_render_onload pd_entity="parquet_df">
    <pd_options>
    {
      "legend": "true",
      "keyFields": "sentiment",
      "clusterby": "entity_type",
      "handlerId": "barChart",
      "rendererId": "bokeh",
      "rowCount": "10",
      "sortby": "Values DESC",
      "noChartCache": "true"
    }
    </pd_options>
</div>
        """

    @route(display_metric2="*")
    def do_display_metric2(self, display_metric2):
        "Line chart keyed by creation time, one subplot per sentiment"
        return """
<div class="no_loading_msg" pd_render_onload pd_entity="parquet_df">
    <pd_options>
    {
      "keyFields": "created_at",
      "rowCount": "1000",
      "handlerId": "lineChart",
      "clusterby": "sentiment",
      "lineChartType": "subplots",
      "legend": "false",
      "noChartCache": "true"
    }
    </pd_options>
</div>
        """

    @route(display_wc="*")
    @captureOutput
    def do_display_wc(self):
        "Draw a word cloud from the non-null entities collected so far"
        text = "\n".join(
            [r['entity'] for r in self.parquet_df.select("entity").collect() if r['entity'] is not None]
        )
        if not text.strip():
            # Nothing to draw yet: WordCloud.generate raises an error when it
            # is given no word at all, so leave the area empty until the next
            # refresh
            return
        plt.figure( figsize=(13,7) )
        plt.axis("off")
        plt.imshow(
            WordCloud(width=750, height=350).generate(text), 
            interpolation='bilinear'
        )

## Putting together the complete PixieApp using TemplateTabbedApp

In [None]:
from pixiedust.display.app import *
from pixiedust.apps.template import TemplateTabbedApp

@PixieApp
class TwitterSentimentApp(TemplateTabbedApp):
    """
    Complete application: a tabbed PixieApp with one tab for the tweet
    insights dashboard and one for the streaming queries monitor.
    """
    def setup(self):
        "Declare the tabs as (title, PixieApp class name) pairs"
        tabs = (
            ("Tweets Insights", "TweetInsightApp"),
            ("Streaming Queries", "StreamingQueriesApp"),
        )
        self.apps = [
            {"title": title, "app_class": app_class} for title, app_class in tabs
        ]

# Instantiate and launch the tabbed application
app = TwitterSentimentApp()
app.run()