In [108]:
import requests
import os
import json

# Never hardcode the bearer token in the notebook: anything committed here is
# readable by everyone with access to the file. Set the environment variable
# in your terminal before starting the kernel instead:
#   export BEARER_TOKEN='<your_bearer_token>'
# SECURITY: a real-looking token was previously committed on this line.
# Treat it as compromised and regenerate it in the Twitter developer portal.
bearer_token = os.environ.get("BEARER_TOKEN")

if not bearer_token:
    # Warn instead of raising so the function definitions below still load;
    # API calls made without a token will fail authentication.
    print("WARNING: BEARER_TOKEN is not set; Twitter API requests will fail authentication.")

def bearer_oauth(r):
    """
    Auth hook passed to ``requests`` through the ``auth=`` argument.

    Stamps the outgoing request ``r`` with the module-level bearer token and
    this client's User-Agent, then hands the same request object back.
    """
    auth_headers = {
        "Authorization": f"Bearer {bearer_token}",
        "User-Agent": "v2FilteredStreamPython",
    }
    r.headers.update(auth_headers)
    return r


def get_rules():
    """
    Fetch the rules currently registered on the filtered stream.

    Prints the JSON payload and returns it as parsed Python objects.
    Raises ``Exception`` when the API answers with anything other than
    HTTP 200.
    """
    rules_url = "https://api.twitter.com/2/tweets/search/stream/rules"
    response = requests.get(rules_url, auth=bearer_oauth)

    status = response.status_code
    if status != 200:
        message = "Cannot get rules (HTTP {}): {}".format(status, response.text)
        raise Exception(message)

    # Parse once and reuse the result for both the log line and the return value
    current_rules = response.json()
    print(json.dumps(current_rules))
    return current_rules


def delete_all_rules(rules):
    """
    Delete every rule listed under ``rules["data"]`` from the filtered stream.

    ``rules`` is the payload returned by ``get_rules``. When it is ``None`` or
    has no ``"data"`` key there is nothing to delete and ``None`` is returned
    without contacting the API. Otherwise the API response is printed; the
    function has no explicit return value. Raises ``Exception`` on a non-200
    response.
    """
    # Guard clauses: nothing registered means nothing to remove
    if rules is None:
        return None
    if "data" not in rules:
        return None

    rule_ids = [entry["id"] for entry in rules["data"]]
    response = requests.post(
        "https://api.twitter.com/2/tweets/search/stream/rules",
        auth=bearer_oauth,
        json={"delete": {"ids": rule_ids}},
    )

    status = response.status_code
    if status != 200:
        raise Exception(
            "Cannot delete rules (HTTP {}): {}".format(status, response.text)
        )
    print(json.dumps(response.json()))


def set_rules(delete):
    """
    Register the sample filtered-stream rules and print the API response.

    ``delete`` is accepted so the call in ``main`` can pass the result of
    ``delete_all_rules``; it is not used inside this function.
    Raises ``Exception`` when the API does not answer with HTTP 201.
    """
    rules_url = "https://api.twitter.com/2/tweets/search/stream/rules"

    # Adjust these rules to change which tweets the stream delivers
    dog_rule = {"value": "dog has:images", "tag": "dog pictures"}
    cat_rule = {"value": "cat has:images -grumpy", "tag": "cat pictures"}

    response = requests.post(
        rules_url,
        auth=bearer_oauth,
        json={"add": [dog_rule, cat_rule]},
    )

    status = response.status_code
    if status != 201:
        raise Exception(
            "Cannot add rules (HTTP {}): {}".format(status, response.text)
        )
    print(json.dumps(response.json()))


def get_stream(countOfTweets):
    """
    Read tweets from the filtered stream until ``countOfTweets`` are collected.

    Parameters
    ----------
    countOfTweets : int or value convertible with ``int()``
        Number of tweets to collect before disconnecting.

    Returns
    -------
    list of dict
        One dict per tweet with the keys ``count`` (1-based position),
        ``id``, ``text`` and ``tag`` (tag of the first matching rule).

    Raises
    ------
    Exception
        If the stream endpoint does not answer with HTTP 200.
    KeyError
        If a streamed JSON object lacks ``data`` or ``matching_rules``.
    """
    countOfTweets = int(countOfTweets)
    all_responses = []

    # The context manager closes the streaming connection on every exit path
    # (normal completion, early return, or exception). The original never
    # closed it, leaving the HTTP stream open after the function returned.
    with requests.get(
        "https://api.twitter.com/2/tweets/search/stream",
        auth=bearer_oauth,
        stream=True,
    ) as response:
        print(response.status_code)
        if response.status_code != 200:
            raise Exception(
                "Cannot get stream (HTTP {}): {}".format(
                    response.status_code, response.text
                )
            )

        # Nothing requested: do not block waiting for a line we would discard
        if countOfTweets <= 0:
            return all_responses

        for response_line in response.iter_lines():
            # Empty lines carry no tweet (presumably keep-alive heartbeats)
            if not response_line:
                continue

            # get the current tweet json
            json_response = json.loads(response_line)

            # extract the data from the tweet and store it in our record
            tweet = {
                "count": len(all_responses) + 1,
                "id": json_response["data"]["id"],
                "text": json_response["data"]["text"],
                "tag": json_response["matching_rules"][0]["tag"],
            }
            all_responses.append(tweet)
            print('tweets streamed: ' + str(tweet["count"]))

            # Stop as soon as the target is reached instead of waiting for
            # one more line to arrive before noticing we are done.
            if len(all_responses) >= countOfTweets:
                break

    # return the streamed responses
    return all_responses


def main():
    """
    Reset the stream rules and collect 100 tweets.

    Fetches the current rules, deletes them, installs the sample rules and
    then returns the list of tweets gathered by ``get_stream(100)``.
    """
    existing_rules = get_rules()
    delete_result = delete_all_rules(existing_rules)
    set_rules(delete_result)
    return get_stream(100)




In [109]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark import SparkFiles
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer
from pyspark.sql.functions import length
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector
from pyspark.ml import Pipeline
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel

# Spark entry point used by the evaluation helpers defined in this cell
spark = (
    SparkSession.builder
    .appName("Twitter_Sentiment_NLP")
    .getOrCreate()
)

# Load the persisted artifacts once, up front, so each evaluation call does
# not have to read them from disk again.
# Saved NaiveBayes classifier (estimator)
nb = NaiveBayes.load("../static/resources/nb")

# Saved trained model used for predictions (per the original note, trained
# on 1 mil tweets)
predictor = NaiveBayesModel.load("../static/resources/nb_model")

def eval_text_single(text, polarity = 1.0):
    """
    Predict the sentiment of a single piece of text.

    Parameters
    ----------
    text : str
        The text to evaluate.
    polarity : float, optional
        Stored in the ``polarity`` column next to the text. None of the
        pipeline stages built here reads that column.

    Returns
    -------
    dict
        ``text`` (the evaluated text), ``prediction`` (``"Positive"`` when the
        model predicts 1, otherwise ``"Negative"``) and ``probability``
        (element 1 of the model's probability vector; presumably the
        positive-class probability; confirm against the trained model).
    """
    # The pipeline doesn't work as well when there is just one record in the
    # list, so pad the request with a few fixed records and append it last.
    text_list = [{"text": "I am so happy for this text!  I can now have everything I want.", "polarity": 1.0},
             {"text": "This sucks!  I don't like this anymore.", "polarity": 0.0},
             {"text" : "This is a bad text.", "polarity" : 0.0},
             {"text": "I love you.", "polarity": 0.0},
             {"text": "Wow!  I can't believe how great this is.", "polarity": 0.0},
            ]
    text_list.append({"text" : text, "polarity" : polarity})

    tweet_df = spark.createDataFrame(text_list)

    # Create a length column to be used as a feature
    data_df = tweet_df.withColumn('length', length(tweet_df['text']))

    # Create all the features for the data set
    tokenizer = Tokenizer(inputCol="text", outputCol="token_text")
    stopRemove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
    hashingTF = HashingTF(inputCol="stop_tokens", outputCol='hash_token')
    idf = IDF(inputCol='hash_token', outputCol='idf_token')

    # Create feature vectors
    clean_up = VectorAssembler(inputCols=['idf_token', 'length'], outputCol='features')

    # Create and run a data processing Pipeline
    data_prep_pipeline = Pipeline(stages=[tokenizer, stopRemove, hashingTF, idf, clean_up])

    # Fit and transform the pipeline
    cleaner = data_prep_pipeline.fit(data_df)
    cleaned = cleaner.transform(data_df)

    # Predict the sentiment using the module-level restored predictor
    test_results = predictor.transform(cleaned)

    df = test_results.select("text","prediction", "probability").toPandas()
    df['probability'] = [prob[1] for prob in df['probability']]

    # The request was appended last, so read the last row rather than a
    # hardcoded index that silently breaks if the padding list changes.
    result = {
        "text": df["text"].iloc[-1],
        "prediction": df["prediction"].iloc[-1],
        "probability": df["probability"].iloc[-1],
    }

    if result["prediction"] == 1:
        result["prediction"] = "Positive"
    else:
        result["prediction"] = "Negative"

    return(result)

def eval_text_list(text_list):
    """
    Predict the sentiment of a batch of tagged texts.

    Parameters
    ----------
    text_list : list of dict
        Records containing at least ``text`` and ``tag`` (for example the
        output of ``get_stream``).

    Returns
    -------
    tuple
        ``(df, top_10, bottom_10)`` where ``df`` is a pandas DataFrame with the
        columns ``text``, ``tag``, ``prediction`` (``"Positive"`` /
        ``"Negative"``), ``probability`` (element 1 of the model's probability
        vector) and ``percent`` (that value formatted as a percentage);
        ``top_10`` and ``bottom_10`` are lists of record dicts holding the ten
        highest and ten lowest ``probability`` rows.
    """
    # Spark cannot infer a schema from an empty list, so answer directly
    # with an empty result of the same shape.
    if isinstance(text_list, (list, tuple)) and len(text_list) == 0:
        empty = pd.DataFrame(
            columns=["text", "tag", "prediction", "probability", "percent"]
        )
        return(empty, [], [])

    tweet_df = spark.createDataFrame(text_list)

    # Create a length column to be used as a feature
    data_df = tweet_df.withColumn('length', length(tweet_df['text']))

    # Create all the features for the data set
    tokenizer = Tokenizer(inputCol="text", outputCol="token_text")
    stopRemove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
    hashingTF = HashingTF(inputCol="stop_tokens", outputCol='hash_token')
    idf = IDF(inputCol='hash_token', outputCol='idf_token')

    # Create feature vectors
    clean_up = VectorAssembler(inputCols=['idf_token', 'length'], outputCol='features')

    # Create and run a data processing Pipeline
    data_prep_pipeline = Pipeline(stages=[tokenizer, stopRemove, hashingTF, idf, clean_up])

    # Fit and transform the pipeline
    cleaner = data_prep_pipeline.fit(data_df)
    cleaned = cleaner.transform(data_df)

    # Predict the sentiment using the module-level restored predictor
    test_results = predictor.transform(cleaned)

    df = test_results.select("text", "tag", "prediction", "probability").toPandas()

    # Keep only element 1 of each probability vector
    df['probability'] = [prob[1] for prob in df['probability']]

    # Format the percentage
    df['percent'] = ["{:.2%}".format(prob) for prob in df['probability']]

    # Rename the prediction to a text value. Build a new column rather than
    # writing strings into the float column through .loc, which recent pandas
    # versions deprecate. Values other than 0.0 / 1.0 are left untouched.
    labels = {1.0: 'Positive', 0.0: 'Negative'}
    df['prediction'] = df['prediction'].map(lambda value: labels.get(value, value))

    # pull out the top and bottom 10 records
    top_10 = df.sort_values(by=['probability'], ascending=False).head(10)
    bottom_10 = df.sort_values(by=['probability'], ascending=True).head(10)

    return(df, top_10.to_dict('records'), bottom_10.to_dict('records'))
    

In [110]:
# Run the full rule-reset + streaming flow and keep the collected tweets in
# `results`, which the later eval_text_list(results) cell consumes.
if __name__ == "__main__":
    results = main()

{"data": [{"id": "1534331066772619264", "value": "cat has:images -grumpy", "tag": "cat pictures"}, {"id": "1534331066772619265", "value": "dog has:images", "tag": "dog pictures"}], "meta": {"sent": "2022-06-08T01:25:24.280Z", "result_count": 2}}
{"meta": {"sent": "2022-06-08T01:25:24.489Z", "summary": {"deleted": 2, "not_deleted": 0}}}
{"data": [{"value": "cat has:images -grumpy", "tag": "cat pictures", "id": "1534345818785730565"}, {"value": "dog has:images", "tag": "dog pictures", "id": "1534345818785730564"}], "meta": {"sent": "2022-06-08T01:25:24.673Z", "summary": {"created": 2, "not_created": 0, "valid": 2, "invalid": 0}}}
200
tweets streamed: 1
tweets streamed: 2
tweets streamed: 3
tweets streamed: 4
tweets streamed: 5
tweets streamed: 6
tweets streamed: 7
tweets streamed: 8
tweets streamed: 9
tweets streamed: 10
tweets streamed: 11
tweets streamed: 12
tweets streamed: 13
tweets streamed: 14
tweets streamed: 15
tweets streamed: 16
tweets streamed: 17
tweets streamed: 18
tweets st

In [15]:
import json
#include json library
import json

# JSON text holding a "rules" array (same sample rules as in set_rules);
# note: despite its name, `employee_string` has nothing to do with employees
employee_string = '{"rules" : [{"value": "dog has:images", "tag": "dog pictures"},{"value": "cat has:images -grumpy", "tag": "cat pictures"}]}'

# parse the JSON text into Python objects
json_object = json.loads(employee_string)

# check the type of the parsed "rules" value
print(type(json_object["rules"]))

# expected output (a JSON array parses to a Python list, as recorded below):
# <class 'list'>

<class 'list'>


In [145]:
# Score the streamed tweets; `results` comes from the main() cell above.
# Yields the full scored frame plus the ten highest / lowest probability rows.
df, top_10, bottom_10 = eval_text_list(results)

22/06/07 19:39:40 WARN DAGScheduler: Broadcasting large task binary with size 8.1 MiB


In [181]:
def format_results_for_plotting(df):
    """
    Aggregate scored tweets into the JSON-friendly structure used for plotting.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``tag`` column and a ``prediction`` column whose values
        are the labels ``"Positive"`` / ``"Negative"``.

    Returns
    -------
    dict
        ``plot_data``      - ``tags`` plus per-tag ``positives`` and
                             ``negatives`` counts, aligned by position
        ``tags``           - sorted distinct tags
        ``totals``         - distinct ``predictions`` and their ``counts``
        ``total_count``    - number of grouped rows
        ``positive_count`` - number of rows predicted ``"Positive"``
    """
    # Totals per prediction label (groupby sorts the labels)
    totals = df.groupby(['prediction']).size().reset_index(name='counts')

    # Sorted distinct tags
    tags = df.groupby('tag').size().index.to_list()

    # (tag, prediction) -> count. A plain dict lookup with a default replaces
    # the original bare `except:` around a positional lookup, which hid every
    # kind of error, not just "no rows for this combination".
    pair_counts = {
        pair: int(count)
        for pair, count in df.groupby(['tag', 'prediction']).size().items()
    }

    # Always emit one value per tag so the lists stay aligned with `tags`,
    # even when a label never occurs in the data. The original produced empty
    # lists in that case.
    positives = [pair_counts.get((tag, "Positive"), 0) for tag in tags]
    negatives = [pair_counts.get((tag, "Negative"), 0) for tag in tags]

    # Dataset for the bar graph
    plot_data = {
        'tags': tags,
        'positives': positives,
        'negatives': negatives,
    }

    # Full dataset for plotting
    data = {
        "plot_data": plot_data,
        "tags": tags,
        "totals": {
            "predictions": totals['prediction'].to_list(),
            "counts": totals['counts'].to_list(),
        },
        "total_count": int(totals['counts'].sum()),
        "positive_count": int(sum(positives)),
    }

    return(data)

# Build the plotting dataset from the scored tweets (`df` comes from the
# eval_text_list cell), show it, and write it as JSON for later use.
data = format_results_for_plotting(df)
print(data)
# NOTE: the path is relative to the notebook's working directory
with open('../static/resources/evaluated_tweets.json', 'w') as fp:
    json.dump(data, fp)

{'plot_data': {'tags': ['cat pictures', 'dog pictures'], 'positives': [41, 11], 'negatives': [31, 17]}, 'tags': ['cat pictures', 'dog pictures'], 'totals': {'predictions': ['Negative', 'Positive'], 'counts': [48, 52]}, 'total_count': 100, 'positive_count': 52}


[{'prediction': 'Negative', 'counts': 48},
 {'prediction': 'Positive', 'counts': 52}]

In [133]:
import json
# Serialize the bar-graph payload as pretty-printed JSON.
# `plot_data` is only a local variable inside format_results_for_plotting, so
# read it from the returned `data` dict; referencing a bare `plot_data` here
# raises NameError on a fresh kernel.
json_object = json.dumps(data["plot_data"], indent = 4)
print(json_object)

{
    "tags": [
        "cat pictures",
        "dog pictures"
    ],
    "positives": [
        41,
        11
    ],
    "negatives": [
        31,
        17
    ]
}
