In [None]:
import pandas as pd

In [None]:
# Start (or reuse) the Spark session for this notebook.
# spark.driver.extraClassPath adds the PostgreSQL JDBC driver jar to the driver's classpath;
# the jar is expected at /content/postgresql-42.2.16.jar.
# NOTE(review): the /content path looks Colab-specific — confirm the jar location in other environments.
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Twitter_Sentiment_NLP").config("spark.driver.extraClassPath","/content/postgresql-42.2.16.jar").getOrCreate()

In [None]:
# Read in data from S3 Buckets
# addFile registers the remote CSV with the Spark context; SparkFiles.get then resolves
# the local path of the registered file by its file name.
from pyspark import SparkFiles
url ="https://ilanp-bucket.s3.us-west-2.amazonaws.com/sentiment_analysis_10k.csv"
spark.sparkContext.addFile(url)
# header=True takes column names from the first row; inferSchema=True lets Spark infer column types.
tweet_df = spark.read.csv(SparkFiles.get("sentiment_analysis_10k.csv"), sep=",", header=True, inferSchema=True)

In [None]:
tweet_df.show(5)

In [None]:
list = [
    {"polarity": 1.0, "label" : 1.0, "text" : "I am so happy to be here today!"},
    {"polarity" : 0.0,"label" : 0.0, "text" : "Today is a terrible day."},
    {"polarity" : 1.0,"label" : 1.0, "text" : "I am so in love today!"}
]

In [None]:
tweet_df2 = spark.createDataFrame(list)
#tweet_df2 = tweet_df
tweet_df2.show()

In [None]:
# Import functions
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer

In [None]:
from pyspark.sql.functions import length
# Add the character count of each tweet as a 'length' column; it is used as a feature later on.
text_length = length(tweet_df2['text'])
data_df = tweet_df2.withColumn('length', text_length)
data_df.show()

In [None]:
# Create all the feature stages for the data set.
# The stages are chained through their column names:
#   text -> token_text -> stop_tokens -> hash_token -> idf_token
# The StringIndexer below is disabled; the data already carries a numeric 'label' column.
#pos_neg_to_num = StringIndexer(inputCol='polarity',outputCol='label')
tokenizer = Tokenizer(inputCol="text", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
hashingTF = HashingTF(inputCol="stop_tokens", outputCol='hash_token')
idf = IDF(inputCol='hash_token', outputCol='idf_token')

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector
# Create feature vectors
clean_up = VectorAssembler(inputCols=['idf_token', 'length'], outputCol='features')

In [None]:
# Create and run a data processing Pipeline
from pyspark.ml import Pipeline
data_prep_pipeline = Pipeline(stages=[tokenizer, stopremove, hashingTF, idf, clean_up])

In [None]:
# Fit and transform the pipeline
cleaner = data_prep_pipeline.fit(data_df)
cleaned = cleaner.transform(data_df)

In [None]:
cleaned.show()

In [None]:
cleaned.select(['polarity','label','features']).show(truncate=False)

In [None]:
# Break data down into a training set and a testing set
#training, testing = cleaned.randomSplit([0.7, 0.3], 21)

In [None]:
#from pyspark.ml.classification import NaiveBayes, NaiveBayesModel
# Create a Naive Bayes model and fit training data
#nb = NaiveBayes()
#predictor = nb.fit(training)

In [None]:
#nb_path = "./nb"
#nb.save(nb_path)

In [None]:
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel
# Restore the saved NaiveBayes classifier
nb2 = NaiveBayes.load("./nb")
nb2.getSmoothing()

In [None]:
#predictor.save("./nb_model")

In [None]:
#Restored the trained predictor
predictor2 = NaiveBayesModel.load("./nb_model")

In [None]:
test_results = predictor2.transform(cleaned)
test_results.show(20) 

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# The message below reports accuracy, so measure accuracy: the share of rows whose
# 'prediction' equals 'label'. BinaryClassificationEvaluator reports area under the ROC
# curve by default and expects raw scores rather than hard predictions, so its result
# must not be labelled "accuracy".
acc_eval = MulticlassClassificationEvaluator(labelCol='label', predictionCol='prediction', metricName='accuracy')
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting Text Sentiment was: %f" % acc)

In [None]:
df = test_results.select("text","label","prediction", "probability").toPandas()

In [None]:
df

In [None]:
from sklearn.metrics import confusion_matrix
# Generate the confusion matrix.
# Passing labels explicitly pins the matrix to 2x2 (negative = 0.0, positive = 1.0) even
# when the sample contains only one class; otherwise the DataFrame below, which always
# has two row and two column labels, would fail to build.
cm = confusion_matrix(df["label"], df["prediction"], labels=[0.0, 1.0])
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"]
)

# Displaying results
display(cm_df)

In [None]:
# Ask for the database password interactively so it is never stored in the notebook
from getpass import getpass
password = getpass('Provide Password')

# Connection settings for the RDS PostgreSQL instance
jdbc_url = "jdbc:postgresql://database-1.c3f2jo4rdylg.us-west-2.rds.amazonaws.com:5432/sentiment_analysis"
mode = "overwrite"
config = dict(
    user="postgres",
    password=password,
    driver="org.postgresql.Driver",
)

In [None]:
test_results["polarity",'text','new_date',"length","label", "token_text","features", "prediction"].show(truncate=False)

In [None]:
# Print every column name with its Spark data type.
# The loop variables are not called `col`, because this notebook also imports
# pyspark.sql.functions.col and a loop variable of that name would overwrite it
# whenever this cell is (re-)run afterwards.
for column_name, column_type in test_results.dtypes:
    print(column_name+" , "+column_type)

In [None]:
from pyspark.sql.functions import col, concat_ws


def _comma_joined(column_name):
    """Return a column expression that joins the named array column with commas."""
    return concat_ws(",", col(column_name))


# Flatten the two token array columns into plain comma-separated strings
test_results2 = test_results.withColumn("token_text", _comma_joined("token_text"))
test_results3 = test_results2.withColumn("stop_tokens", _comma_joined("stop_tokens"))
test_results3.show(truncate=False)

In [None]:
# Write the scored tweets to the test_results table in RDS.
# `mode`, `jdbc_url` and `config` come from the RDS settings cell above.
# NOTE(review): 'new_date' must exist in test_results (it is not part of the hand-written
# sample tweets), and test_results3 holds token_text as a comma-joined string — confirm
# which DataFrame and column set the target table expects.
rds_columns = ['polarity', 'text', 'new_date', "length", "label", "token_text", "prediction"]
test_results.select(*rds_columns).write.jdbc(
    url=jdbc_url,
    table='test_results',
    mode=mode,
    properties=config,
)

In [None]:
import requests
import os
import json
import pandas as pd
from flask import session


# To set your environment variables in your terminal run the following line:
# export 'BEARER_TOKEN'='<your_bearer_token>'
# In this notebook the token is read from ../myconfig.json and exported to the
# environment instead. The parsed file is kept in `twitter_config` so that it does not
# overwrite the `config` dict holding the JDBC connection properties.
with open('../myconfig.json','r') as fh:
    twitter_config = json.load(fh)
os.environ["BEARER_TOKEN"] = twitter_config["BEARER_TOKEN"]

bearer_token = os.environ.get("BEARER_TOKEN")


def bearer_oauth(r):
    """
    Auth hook for requests: stamp the bearer token and user agent on a request.

    Sets the "Authorization" and "User-Agent" headers on ``r`` (using the
    module-level ``bearer_token``) and returns the same object.
    """
    auth_headers = {
        "Authorization": f"Bearer {bearer_token}",
        "User-Agent": "v2FilteredStreamPython",
    }
    for header_name, header_value in auth_headers.items():
        r.headers[header_name] = header_value
    return r

def create_rules(hashtag_data):
    """
    Build the JSON rule-set string for a collection of trending hashtags.

    Args:
        hashtag_data: mapping with a 'tw_trends' list; every entry is a mapping
            with a 'hashtag' string.

    Returns:
        A JSON string of the form '{"rules" : [{"value": ..., "tag": ...}, ...]}'.
        Each rule value is the hashtag followed by the fixed filter
        ' -is:retweet lang:en -has:links -has:media'; the tag is the hashtag itself.
        The string is also printed.
    """
    rule_filter = ' -is:retweet lang:en -has:links -has:media'
    encoded_rules = []
    for hashtag in hashtag_data['tw_trends']:
        tag = hashtag['hashtag']
        # json.dumps escapes quotes, backslashes and control characters, so a hashtag
        # containing them can no longer produce an invalid JSON document.
        encoded_rules.append(
            '{"value": ' + json.dumps(tag + rule_filter, ensure_ascii=False)
            + ', "tag": ' + json.dumps(tag, ensure_ascii=False) + '}'
        )
    new_rules = '{"rules" : [' + ','.join(encoded_rules) + ']}'
    print(new_rules)
    return new_rules

def get_rules():
    """
    Fetch the rules currently registered on the filtered-stream rules endpoint.

    Returns:
        The decoded JSON body of the response (also printed as a JSON string).

    Raises:
        Exception: if the endpoint answers with any status other than 200.
    """
    rules_response = requests.get(
        "https://api.twitter.com/2/tweets/search/stream/rules", auth=bearer_oauth
    )
    status = rules_response.status_code
    if status == 200:
        current_rules = rules_response.json()
        print('get_rules() response:')    
        print(json.dumps(current_rules))
        return current_rules
    raise Exception(
        "Cannot get rules (HTTP {}): {}".format(status, rules_response.text)
    )


def delete_all_rules(rules):
    """
    Delete every rule contained in a get_rules()-style response.

    Args:
        rules: decoded rules response; the rule ids are read from rules["data"].
            None, a response without a "data" key, or an empty "data" list means
            there is nothing to delete.

    Returns:
        None in every case.

    Raises:
        Exception: if the rules endpoint answers with any status other than 200.
    """
    if rules is None or "data" not in rules:
        return None

    # A comprehension instead of list(map(...)): it keeps working even when the name
    # `list` has been rebound to something else in the notebook namespace.
    ids = [rule["id"] for rule in rules["data"]]
    if not ids:
        # Nothing to delete, so skip the request rather than post an empty id list.
        return None

    payload = {"delete": {"ids": ids}}
    response = requests.post(
        "https://api.twitter.com/2/tweets/search/stream/rules",
        auth=bearer_oauth,
        json=payload
    )
    if response.status_code != 200:
        raise Exception(
            "Cannot delete rules (HTTP {}): {}".format(
                response.status_code, response.text
            )
        )
    print('delete_all_rules(rules) response:')
    # The response body used to be serialised and discarded; actually print it.
    print(json.dumps(response.json()))
    return None


def set_rules(rules):
    """
    Register stream rules on the filtered-stream rules endpoint.

    Args:
        rules: list of rule dicts to add. When None, two built-in sample rules
            (dog and cat pictures) are registered instead.

    Returns:
        None. The endpoint's JSON answer is printed.

    Raises:
        Exception: if the endpoint answers with any status other than 201.
    """
    if rules is not None:
        print('assigning specified rules')
        rules_to_add = rules
    else:
        # Fallback rule set used when the caller supplies nothing
        rules_to_add = [
            {"value": "dog has:images", "tag": "dog pictures"},
            {"value": "cat has:images -grumpy", "tag": "cat pictures"},
        ]

    add_response = requests.post(
        "https://api.twitter.com/2/tweets/search/stream/rules",
        auth=bearer_oauth,
        json={"add": rules_to_add},
    )
    status = add_response.status_code
    if status != 201:
        raise Exception(
            "Cannot add rules (HTTP {}): {}".format(status, add_response.text)
        )
    print('set_rules(delete) response:')      
    print(json.dumps(add_response.json()))


def get_stream(countOfTweets):
    """
    Read tweets from the filtered stream until the requested number is collected.

    Args:
        countOfTweets: number of tweets to collect; anything int() accepts.
            A value of zero or less returns an empty list without reading the stream.

    Returns:
        A list of dicts, one per tweet, with the keys 'count' (1-based position),
        'id', 'text' and 'tag' (tag of the first matching rule, or None when the
        payload carries no matching rule).

    Raises:
        Exception: if the stream endpoint answers with any status other than 200.
    """
    countOfTweets = int(countOfTweets)
    response = requests.get(
        "https://api.twitter.com/2/tweets/search/stream", auth=bearer_oauth, stream=True,
    )
    # The connection is opened with stream=True, so it stays open until it is closed
    # explicitly; the finally block guarantees that on every exit path.
    try:
        print(response.status_code)
        if response.status_code != 200:
            raise Exception(
                "Cannot get stream (HTTP {}): {}".format(
                    response.status_code, response.text
                )
            )

        all_responses = []
        if countOfTweets <= 0:
            return all_responses

        for response_line in response.iter_lines():
            # Blank lines carry no tweet (presumably keep-alive newlines) — skip them.
            if not response_line:
                continue

            # get the current tweet json
            json_response = json.loads(response_line)

            # Payloads without a "data" object are not tweets; report and skip them
            # instead of failing with a KeyError.
            data = json_response.get("data")
            if not data:
                print('skipping non-tweet payload: ' + json.dumps(json_response))
                continue

            matching_rules = json_response.get("matching_rules") or [{}]

            # extract the data from the tweet; 'count' is its 1-based position
            tweet = {
                'count': len(all_responses) + 1,
                'id': data["id"],
                'text': data["text"],
                'tag': matching_rules[0].get("tag"),
            }

            # add the tweet to our response list
            all_responses.append(tweet)
            print('tweets streamed: ' + str(tweet['count']))

            # Stop as soon as the quota is met, rather than waiting for the next line
            # to arrive before noticing that enough tweets were collected.
            if len(all_responses) >= countOfTweets:
                break

        # return the streamed responses
        return all_responses
    finally:
        response.close()


            

def main(count_of_tweets=10):
    """
    Run the whole streaming workflow once with the built-in sample rules.

    Fetches the registered rules, deletes them, registers the sample rules from
    set_rules() and then collects tweets from the stream.

    Args:
        count_of_tweets: number of tweets to collect (default 10).

    Returns:
        The list of tweets returned by get_stream().
    """
    existing_rules = get_rules()
    delete_all_rules(existing_rules)
    # Passing None makes set_rules register its built-in sample rules.
    set_rules(None)
    return get_stream(count_of_tweets)


# Left disabled on purpose: running this cell must not open a stream.
# The call is commented out together with its guard; an indented, un-commented
# `main()` here would be parsed as the last statement of main() itself.
#if __name__ == "__main__":
#    main()

In [None]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark import SparkFiles
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
from pyspark.sql.functions import length
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel

# Create the spark session (getOrCreate returns the already running session if there is one)
spark = SparkSession.builder.appName("Twitter_Sentiment_NLP").getOrCreate()

# Pre-load the classifier and the model once at module level, so the eval_* functions
# below do not reload them on every call.
# Load the saved NaiveBayes Classifier
# NOTE(review): `nb` is not referenced by the functions in this cell — confirm it is still needed.
nb = NaiveBayes.load("../static/resources/nb")

# Restore the trained predictor (trained on 1 mil tweets); used by eval_text_single and eval_text_list
predictor = NaiveBayesModel.load("../static/resources/nb_model")

def eval_text_single(text, polarity = 1.0):
    """
    Predict the sentiment of one piece of text with the pre-loaded model.

    Args:
        text: the text to score.
        polarity: value stored in the request's 'polarity' column (default 1.0);
            it is not part of the feature vector.

    Returns:
        A dict with the keys 'text' (the scored text), 'prediction'
        ("Positive" when the model predicts 1, otherwise "Negative") and
        'probability' (the model's probability for class 1).
    """
    # The pipeline doesn't work as well when there is just one record, so the request
    # is appended to a fixed list of filler records.
    # NOTE(review): the IDF stage is re-fitted on these few rows on every call, so its
    # weights presumably differ from the ones the saved model was trained with —
    # consider persisting and loading the fitted pipeline instead. TODO confirm.
    text_list = [{"text": "I am so happy for this text!  I can now have everything I want.", "polarity": 1.0},
             {"text": "This sucks!  I don't like this anymore.", "polarity": 0.0},
             {"text" : "This is a bad text.", "polarity" : 0.0},
             {"text": "I love you.", "polarity": 0.0},
             {"text": "Wow!  I can't believe how great this is.", "polarity": 0.0},
            ]

    # The request goes last, so it is read back below as the last row.
    text_list.append({"text" : text, "polarity" : polarity})

    tweet_df = spark.createDataFrame(text_list)

    # Create a length column to be used as a feature
    data_df = tweet_df.withColumn('length', length(tweet_df['text']))

    # Create all the feature stages
    tokenizer = Tokenizer(inputCol="text", outputCol="token_text")
    stopRemove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
    hashingTF = HashingTF(inputCol="stop_tokens", outputCol='hash_token')
    idf = IDF(inputCol='hash_token', outputCol='idf_token')

    # Create feature vectors
    clean_up = VectorAssembler(inputCols=['idf_token', 'length'], outputCol='features')

    # Create and run a data processing Pipeline
    data_prep_pipeline = Pipeline(stages=[tokenizer, stopRemove, hashingTF, idf, clean_up])

    # Fit and transform the pipeline
    cleaner = data_prep_pipeline.fit(data_df)
    cleaned = cleaner.transform(data_df)

    # Predict the sentiment of the text using the pre-loaded predictor
    test_results = predictor.transform(cleaned)

    df = test_results.select("text","prediction", "probability").toPandas()

    # Read the request back by position (last row) instead of a hard-coded index that
    # would silently break if the filler list ever changed length.
    predicted_class = df["prediction"].iloc[-1]
    positive_probability = df["probability"].iloc[-1][1]

    result = {}
    result["text"] = df["text"].iloc[-1]
    result["prediction"] = "Positive" if predicted_class == 1 else "Negative"
    result["probability"] = positive_probability

    return(result)

def eval_text_list(text_list):
    """
    Predict the sentiment of a list of tweets with the pre-loaded model.

    Args:
        text_list: list of dicts; each needs at least the keys 'text' and 'tag'.

    Returns:
        A tuple (df, top_10, bottom_10) of pandas DataFrames with the columns
        'text', 'tag', 'prediction' ('Positive' / 'Negative'), 'probability'
        (model probability for class 1) and 'percent' (that probability formatted
        as a percentage). top_10 / bottom_10 hold the ten rows with the highest /
        lowest probability. For an empty input all three frames are empty.
    """
    # An empty list gives Spark nothing to infer a schema from, so answer directly
    # with empty frames that have the usual columns.
    if not text_list:
        empty = pd.DataFrame(columns=["text", "tag", "prediction", "probability", "percent"])
        return (empty, empty.copy(), empty.copy())

    tweet_df = spark.createDataFrame(text_list)

    # Create a length column to be used as a feature
    data_df = tweet_df.withColumn('length', length(tweet_df['text']))

    # Create all the feature stages
    tokenizer = Tokenizer(inputCol="text", outputCol="token_text")
    stopRemove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
    hashingTF = HashingTF(inputCol="stop_tokens", outputCol='hash_token')
    idf = IDF(inputCol='hash_token', outputCol='idf_token')

    # Create feature vectors
    clean_up = VectorAssembler(inputCols=['idf_token', 'length'], outputCol='features')

    # Create and run a data processing Pipeline
    data_prep_pipeline = Pipeline(stages=[tokenizer, stopRemove, hashingTF, idf, clean_up])

    # Fit and transform the pipeline
    cleaner = data_prep_pipeline.fit(data_df)
    cleaned = cleaner.transform(data_df)

    # Predict the sentiment of the text using the pre-loaded predictor
    test_results = predictor.transform(cleaned)

    df = test_results.select("text", "tag", "prediction", "probability").toPandas()

    # Keep only the probability of class 1 for each row
    df['probability'] = [prob[1] for prob in df['probability']]
    df['percent'] = ["{:.2%}".format(prob) for prob in df['probability']]

    # Replace the numeric class with a readable label by rebuilding the whole column,
    # rather than writing strings into a float column cell by cell. Any value other
    # than 1.0 / 0.0 is kept unchanged.
    class_names = {1.0: 'Positive', 0.0: 'Negative'}
    df['prediction'] = df['prediction'].map(lambda value: class_names.get(value, value))

    top_10 = df.sort_values(by=['probability'], ascending=False).head(10)
    bottom_10 = df.sort_values(by=['probability'], ascending=True).head(10)

    return(df, top_10, bottom_10)

In [None]:
from xml.etree.ElementTree import tostring
from flask import Flask, render_template, redirect, url_for, request, session
import json



#pull the rules from the textarea input box
rules = '{"rules" : [{"value": "Apple -is:retweet lang:en -has:links -has:media", "tag": "Apple"},{"value": "#WWDC22 -is:retweet lang:en -has:links -has:media", "tag": "#WWDC22"},{"value": "#LoveIsland -is:retweet lang:en -has:links -has:media", "tag": "#LoveIsland"},{"value": "Proud Boys -is:retweet lang:en -has:links -has:media", "tag": "Proud Boys"},{"value": "iOS 16 -is:retweet lang:en -has:links -has:media", "tag": "iOS 16"},{"value": "Aaron Donald -is:retweet lang:en -has:links -has:media", "tag": "Aaron Donald"},{"value": "Wilbur -is:retweet lang:en -has:links -has:media", "tag": "Wilbur"},{"value": "Giant Bomb -is:retweet lang:en -has:links -has:media", "tag": "Giant Bomb"},{"value": "Jocelyn Alo -is:retweet lang:en -has:links -has:media", "tag": "Jocelyn Alo"},{"value": "Michigan -is:retweet lang:en -has:links -has:media", "tag": "Michigan"}]}'

#pull the number of Tweets to request from the Twitter API
countOfTweets = 25
# Apply the fallback before the value is used, so a missing count is never printed as None
if countOfTweets is None:
    countOfTweets = 10
print(f'Count of Tweets: {str(countOfTweets)}')

print('rules: type: ' + rules)

#Perform the steps needed to receive the twitter stream

rules = json.loads(rules)
#get the previous rules
old_rules = get_rules()

#delete the previous rules
# delete_all_rules and set_rules return nothing, so their results are not stored;
# storing them under `set` would also hide the builtin of that name.
delete_all_rules(old_rules)

#set the rules to be the new rules
set_rules(rules["rules"])


In [None]:
#Start the twitter stream with the requested rule set
tweet_list = get_stream(countOfTweets) 

In [None]:
#Send the collected twitter feed to the machine learning model
eval_list, top_10, bottom_10 = eval_text_list(tweet_list)  

In [None]:
#print the returned eval_list
top_10

In [None]:
bottom_10