In [46]:
import csv
import os
import random
import re

from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types

# Register the location of the credentials file downloaded from the Google API console
# under the GOOGLE_APPLICATION_CREDENTIALS environment variable.
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS='../../../google_access.json')

In [47]:
# Read every tweet from the labelled corpus.
# Nothing is filtered here: all tweets are analysed and saved, and the filtering
# by sentiment label happens later when the saved results are evaluated.
negative_label = 'negative'
neutral_label = 'neutral'
positive_label = 'positive'
irrelevant_label = 'irrelevant'
# newline='' hands line-ending handling to the csv module, so quoted fields that
# contain line breaks are parsed as one field (same as the other open() calls here).
with open('./../../../data/external/sanders.csv', newline='') as csv_file:
    iterator = csv.reader(csv_file, delimiter=',')
    # Each row is unpacked as (topic, sentiment, id, tweet); the topic is dropped
    # and the remaining fields are reordered to (tweet, sentiment, id).
    tweets = [(tweet, sentiment, tweet_id) for (topic, sentiment, tweet_id, tweet) in iterator]

len(tweets)

4345

In [48]:
# Keep only the tweets from index 816 onwards for the cells below.
# NOTE(review): presumably this resumes a run whose earlier results are already in the
# output file (which is opened in append mode) — confirm. Drop the slice to process everything.
# Caution: re-running this cell slices the already-sliced list again.
first_tweet_index = 816
tweets = tweets[first_tweet_index:]
tweets[0]

("'@bassponton @iphoneclub @kpn @iphone @apple ja. En dan heb je voor dat geld ook nog een datalimiet....'",
 'neutral',
 '125318029390249984')

In [49]:
# Some utility functions for preprocessing
def remove_urls(text):
    """Return ``text`` with every URL removed.

    A URL is taken to be ``http`` followed by a run of non-whitespace characters.
    """
    return re.sub(r"http\S+", "", text)


def preprocess(tweet):
    """Strip URLs and special characters from a tweet.

    URLs are removed first, then the text is split on whitespace and punctuation.
    Only purely alphabetic tokens are kept; they are re-joined with single spaces.

    :param tweet: the raw tweet text
    :return: the cleaned text, or an empty string if no alphabetic token remains
    """
    # The hyphen is escaped so it is a literal separator. Unescaped, ":-@" is the
    # character range ':' .. '@', which silently left '-' out of the set.
    # '<', '=' and '>' were part of that range and are listed explicitly so that
    # everything that used to split still splits.
    tokens = re.split(r"""[\s;,.#:\-@!?'"<=>]""", remove_urls(tweet))
    return " ".join(token for token in tokens if token.isalpha())

In [50]:
# Perform sentiment analysis on all tweets and append one row per tweet to the output file.
# Row layout: tweet, sentiment label, tweet id, score, magnitude.
client = language.LanguageServiceClient()
failed_count = 0
last_error = None
# The `with` statement closes the file on exit, so no explicit close() is needed.
with open('../../../data/interim/sanders_gcloud.csv', 'a', newline='') as out_file:
    csv_writer = csv.writer(out_file, delimiter=',')
    for (tweet, sentiment, tweet_id) in tweets:
        # Create the document from the tweet
        document = types.Document(
            content=tweet,
            # PLAIN_TEXT or HTML
            type=enums.Document.Type.PLAIN_TEXT)
        # Analyze the sentiment
        try:
            annotations = client.analyze_sentiment(document=document)
            score = annotations.document_sentiment.score
            magnitude = annotations.document_sentiment.magnitude
        except Exception as error:
            # Best effort: keep going and mark this tweet as not analysed.
            # `Exception` (rather than a bare except) lets KeyboardInterrupt and
            # SystemExit through, so a long run can still be stopped.
            failed_count += 1
            last_error = error
            score = magnitude = "N/A"
        csv_writer.writerow((tweet, sentiment, tweet_id, score, magnitude))

# Report failures once instead of hiding them completely.
if failed_count:
    print("%d tweets could not be analysed; last error: %r" % (failed_count, last_error))

In [53]:
# Load the analysed tweets back from disk in preparation for checking how many
# were labeled correctly. Only a local file is read, so no API client is created.
# Counter used by the evaluation that follows.
labeled_correctly = 0
with open('../../../data/interim/sanders_gcloud.csv', 'r', newline='') as in_file:
    csv_reader = csv.reader(in_file, delimiter=',')
    # Load the tweets into memory, keeping only those labelled positive or negative.
    # Unpacking into five names requires every row to have exactly five columns.
    tweets = [(text, sentiment, tweet_id, score, magnitude)
              for (text, sentiment, tweet_id, score, magnitude) in csv_reader
              if sentiment in (positive_label, negative_label)]

# Rows whose analysis failed were written with "N/A" in the score/magnitude columns.
valid_tweets = [row for row in tweets if row[3] != "N/A" and row[4] != "N/A"]

print("%d tweets read, %d valid" % (len(tweets),len(valid_tweets)))

945 tweets read, 945 valid


In [54]:
    
    # Count the tweets whose score sign agrees with the label: a negative score for a
    # negative label, a positive score for a positive label. A score of exactly 0
    # never counts as correct.
    # The counter is reset here so that re-running this cell does not accumulate.
    labeled_correctly = 0
    for (text, sentiment, tweet_id, score, magnitude) in valid_tweets:
        polarity = float(score)
        if (polarity < 0 and sentiment == negative_label) or (polarity > 0 and sentiment == positive_label):
            labeled_correctly += 1
    # The denominator is every tweet that was read, not just the valid ones, so
    # tweets without a score count as labeled incorrectly.
    accuracy = labeled_correctly / len(tweets) * 100 if tweets else 0.0
    # %.2f prints two decimals; the former %.2d was an integer format and truncated them.
    print("%d/%d = %.2f percent" % (labeled_correctly, len(tweets), accuracy))

588/945 = 62 percent
