# Twitter Sentiment Analysis with PySpark

# Predicting Sentiments

The first step in any Apache Spark program is to create a SparkContext. A SparkContext is needed whenever we want to execute operations on a cluster: it tells Spark how and where to access the cluster, so creating it is the first step in connecting to a Spark cluster.

In [None]:
from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext, SparkSession
import warnings

# Directory handed to ssc.checkpoint() below (an absolute path on the author's machine).
SCC_CHECKPOINT_PATH = "/Users/anujchaudhari/Desktop/256/project/samples/twitter_streaming/checkpoint"
# Host and port of the text socket the stream is read from.
STREAMING_SOCKET_IP = "192.168.0.100"
STREAMING_SOCKET_PORT = 5555
# Used both as the StreamingContext batch duration and as the window length (seconds).
STREAMING_TIME_INTERVAL = 2

try:
    # Get (or reuse) the SparkSession named "twitter" and derive the SparkContext
    # and SQLContext from it. No master is configured here, so the master comes
    # from the environment; it is printed at the end of this cell.
    
    spark = SparkSession.builder.appName("twitter").getOrCreate()
    sc = spark.sparkContext
    sqlContext = SQLContext(sc)

    print("Just created a SparkContext")
    
except ValueError:
    # NOTE(review): if this branch runs, `sc` is not assigned by this cell; the
    # code below then relies on an `sc` left over from an earlier run -- confirm.
    warnings.warn("SparkContext already exists in this scope")
    
    

# Create Spark Streaming Context

# Build the streaming context on top of `sc`, enable checkpointing, and open
# the socket text stream. `lines` is the windowed stream that later cells
# transform and consume.
ssc = StreamingContext(sc, STREAMING_TIME_INTERVAL )
ssc.checkpoint(SCC_CHECKPOINT_PATH)
socket_stream = ssc.socketTextStream(STREAMING_SOCKET_IP, STREAMING_SOCKET_PORT)
lines = socket_stream.window(STREAMING_TIME_INTERVAL)

print("SparkContext Master: " + sc.master)

### Model Loading

In [None]:
from pyspark.ml import PipelineModel

# Load the previously saved, fitted pipeline from the "Model_Twitter_Sentiment"
# path; it is applied to incoming tweets later via pipeline.transform().
# NOTE(review): the path is relative -- presumably resolved against Spark's
# default filesystem / working directory; confirm where the model lives.
pipeline = PipelineModel.load("Model_Twitter_Sentiment")

### Test Set Prediction 

### Tweet Cleaner

In [None]:
import re
from bs4 import BeautifulSoup
from nltk.tokenize import WordPunctTokenizer
# Tokenizer used by tweet_cleaner_updated() to split the cleaned text into words.
tok = WordPunctTokenizer()

# pat1: @mentions ('@' followed by letters, digits or underscores).
# pat2: http:// or https:// URLs, up to the next space.
pat1 = r'@[A-Za-z0-9_]+'
pat2 = r'https?://[^ ]+'
combined_pat = r'|'.join((pat1, pat2))
# "www" followed by any one character and then a run of non-space characters.
# NOTE(review): the '.' is unescaped, so this also matches inside words that
# merely contain "www" (e.g. "awww ..."). Left unchanged because the saved model
# was presumably trained on text cleaned with this exact pattern -- confirm
# before tightening it to r'www\.[^ ]+'.
www_pat = r'www.[^ ]+'
# Lowercase negation contractions and the expanded form each is replaced with.
negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
                "haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
                "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
                "can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
                "mustn't":"must not"}
# One alternation of all contraction keys, anchored on word boundaries.
neg_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')

def tweet_cleaner_updated(text):
    """Normalize a raw tweet into a lowercase, letters-only string of words.

    Steps, in order: extract the text content with BeautifulSoup (lxml
    parser), drop a leading BOM and replace U+FFFD with "?", remove
    @mentions and http(s)/www links, lowercase, expand the contractions in
    ``negations_dic``, replace every non-letter with a space, then tokenize
    and keep only tokens longer than one character.

    Args:
        text: Raw tweet text.

    Returns:
        The remaining words joined by single spaces ("" if nothing is left).
    """
    souped = BeautifulSoup(text, 'lxml').get_text()
    # get_text() returns str, which has no .decode(); the previous code called
    # it anyway and relied on a bare `except:` to swallow the AttributeError on
    # every call. Do the intended BOM / replacement-character cleanup
    # explicitly instead (the final cleaned output is unchanged).
    bom_removed = souped.lstrip(u"\ufeff").replace(u"\ufffd", "?")
    stripped = re.sub(combined_pat, '', bom_removed)
    stripped = re.sub(www_pat, '', stripped)
    lower_case = stripped.lower()
    neg_handled = neg_pattern.sub(lambda m: negations_dic[m.group()], lower_case)
    letters_only = re.sub("[^a-zA-Z]", " ", neg_handled)
    # Replacing non-letters with spaces leaves runs of unnecessary whitespace;
    # tokenizing and re-joining collapses them.
    words = [w for w in tok.tokenize(letters_only) if len(w) > 1]
    return (" ".join(words)).strip()


### Spark Streaming Tweet Handling

In [None]:
import time
from pyspark.sql import Row
from pyspark.sql import SparkSession


def processTweets(rdd):
    """Clean the tweets of one streaming batch and publish them as the "tweets" view.

    The RDD is collected on the driver and its first element is treated as
    the sequence of raw tweet strings for the batch. Each tweet is cleaned
    with ``tweet_cleaner_updated`` and stored with its index (``_c0``), the
    cleaned text (``text``), the raw text (``original``) and a constant
    ``target`` of 0. The resulting DataFrame replaces the temp view "tweets".

    Args:
        rdd: The batch RDD passed in by ``foreachRDD``.

    Returns:
        None. Any exception is printed rather than raised, so a bad batch
        does not propagate out of the streaming callback.
    """
    # An explicit schema keeps the columns identical for every batch and lets an
    # empty batch produce an empty table. The old placeholder row had no
    # `original` column, which broke readers that access `tweet.original`.
    schema = "_c0 long, text string, original string, target long"
    try:
        spark = SparkSession.builder.appName("twitter").getOrCreate()

        collected = rdd.collect()
        tweets = list(collected[0]) if collected else []

        rows = [
            (index, tweet_cleaner_updated(raw), raw, 0)
            for index, raw in enumerate(tweets)
        ]

        df = spark.createDataFrame(rows, schema=schema)
        # registerTempTable is deprecated; this is its documented replacement.
        df.createOrReplaceTempView("tweets")

    except Exception as e:
        print(e)
    
# Per-batch normalization of the windowed stream:
#   1. lowercase each record,
#   2. drop the " rt " marker and flatten newlines to spaces,
#   3. concatenate all records of the batch into one string,
#   4. split that string on the " $$$$$$ " delimiter (presumably the separator
#      the sender puts between tweets -- verify against the socket producer).
lines = (
    lines
    .map(lambda text: text.lower())
    .map(lambda text: text.replace(" rt " , " "))
    .map(lambda text: text.replace("\n" , " "))
    .reduce(lambda left, right: left + right)
    .map(lambda merged: merged.split(" $$$$$$ "))
)

# Hand every batch RDD to processTweets, which refreshes the "tweets" table.
lines.foreachRDD(lambda batch_rdd: processTweets(batch_rdd))



In [None]:
# Start the streaming computation; from here on the foreachRDD callback
# registered on `lines` is invoked for each batch.
ssc.start()

### Redis Queue Config

In [None]:
import redis

# Connection parameters for the Redis server the predictions are published to.
config = dict(host='localhost', port=6379, db=0)

# Pub/sub channel on which predicted tweets are published.
channel = "tweet_prediction"

# Client used by the prediction loop to publish to `channel`.
redis_object = redis.StrictRedis(**config)

### Get Tweet Data from temp table and predict the sentiment

In [None]:
import time
import re
import json

# Poll the "tweets" temp table 30 times, score its rows with the loaded
# pipeline, and publish each tweet with its predicted sentiment to Redis.
count = 0
predicted_tweets = []
# Presumably gives the streaming job time to register the first "tweets"
# table before it is queried -- TODO confirm the delay is sufficient.
time.sleep(5)

# NOTE(review): there is no wait between iterations, so the same table
# contents may be scored and published more than once -- confirm intended.
while count < 30:

    print("Processing BLOCK " + str(count) )

    df_all_tweets = sqlContext.sql( 'Select * from tweets' )

    # A table without an `original` column (e.g. a placeholder registered for
    # an empty batch) has nothing to publish; skip it instead of failing on
    # `tweet.original` below.
    if "original" in df_all_tweets.columns:
        predicted_tweets = pipeline.transform(df_all_tweets).collect()
    else:
        predicted_tweets = []

    for tweet in predicted_tweets:
        # Send the raw tweet text and its predicted label to the Redis channel
        # as UTF-8 encoded JSON.
        payload = {"text": tweet.original, "sentiment": tweet.prediction}
        redis_object.publish(channel, json.dumps(payload).encode('UTF-8'))

    count = count + 1

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that d

In [30]:
# Stop the streaming context now that publishing is finished.
# NOTE(review): depending on StreamingContext.stop() defaults this may also
# stop the underlying SparkContext -- confirm before reusing `sc` afterwards.
ssc.stop()