# Example from www.nltk.org/howto/sentiment.html
# Testing different SA methods
## nltk's sentiment analyzer

In [101]:
import nltk
import csv
import pickle

In [128]:
# This is specific to the dataset
negative_label = 'negative'
neutral_label = 'neutral'
positive_label = 'positive'
irrelevant_label = 'irrelevant'
with open('./../../data/external/sanders.csv') as csv_file:
    iterator = csv.reader(csv_file, delimiter=',')
    # Only keep entries with positive or negative sentiment
    tweets = [(tweet, sentiment) for (topic, sentiment, id, tweet) in iterator if sentiment == negative_label or sentiment == positive_label]

In [129]:
tweets[0]

('"@Twitter CEO points to @Apple as \'corporate mentor\' as @iOS signups triple http://t.co/GCY8iphN"',
 'positive')

In [130]:
len(tweets)

945

In [131]:
# This is also specific to the dataset
def preprocess(tweets):
    tokenizer = nltk.tokenize.TweetTokenizer()
    preprocessed_tweets = []
    for (text, sentiment) in tweets:
        words_filtered = tokenizer.tokenize(text)
        preprocessed_tweets.append((words_filtered, sentiment))
    return preprocessed_tweets

In [132]:
# Preprocess tweets
preprocessed_tweets = preprocess(tweets)
preprocessed_tweets[0]

(['"',
  '@Twitter',
  'CEO',
  'points',
  'to',
  '@Apple',
  'as',
  "'",
  'corporate',
  'mentor',
  "'",
  'as',
  '@iOS',
  'signups',
  'triple',
  'http://t.co/GCY8iphN',
  '"'],
 'positive')

In [134]:
# Split between training and test set
# There are 4345 tweets in the dataset
train_tweets = preprocessed_tweets[:900]
test_tweets = preprocessed_tweets[900:]

In [135]:
sentim_analyzer = SentimentAnalyzer()
all_words_neg = sentim_analyzer.all_words([nltk.sentiment.util.mark_negation(doc) for doc in train_tweets])

We use simple unigram word features, handling negation:

In [136]:
unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
len(unigram_feats)

496

In [137]:
sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

We apply features to obtain a feature-value representation of our datasets:

In [138]:
training_set = sentim_analyzer.apply_features(train_tweets)
test_set = sentim_analyzer.apply_features(test_tweets)

We can now train our classifier on the training set, and subsequently output the evaluation results:

In [139]:
trainer = NaiveBayesClassifier.train
classifier = sentim_analyzer.train(trainer, training_set)
# TODO check source (sadly the dataset cannot be downloaded)
# nltk.sentiment.util.demo_tweets(trainer)

Training classifier


In [140]:
for key,value in sorted(sentim_analyzer.evaluate(test_set).items()):
    print('{0}: {1}'.format(key, value))

Evaluating NaiveBayesClassifier results...
Accuracy: 0.24444444444444444
F-measure [negative]: 0.39285714285714285
Precision [negative]: 1.0
Recall [negative]: 0.24444444444444444


### TODO
- Do this on a twitter training set

In [98]:
# TODO this has to bring the tweets in about the same format as the built-in training set
def clean_split(text):
    return [word for word in text.split()
                            if 'http' not in word
                                and not word.startswith('@')
                                and not word.startswith('#')
                                and word != 'RT'
                            ]

In [99]:
# Evaluate classifier performance on twitter test set
import os
print(sentim_analyzer.classify(["I","hate","twitter","so","much"])) # "obj" --> objective

with open('./../../data/external/umich_sentiment_labeled.txt') as twitter_test_set_file:
    twitter_test_set = csv.reader(twitter_test_set_file, delimiter='\t')
    # Bring the data in the format expected by the evaluate-function
    evaluation_docs = [(clean_split(doc[1]), 'subj' if doc[0] == '1' else 'obj') for doc in twitter_test_set]
    #print(evaluation_docs[:10])
    evaluation_set = sentim_analyzer.apply_features(evaluation_docs)
    for key,value in sorted(sentim_analyzer.evaluate(evaluation_set).items()):
        print('{0}: {1}'.format(key, value))

# The Problem: the classifier is trained on subjectivity and objectivity, not positive/negative!

obj
Evaluating NaiveBayesClassifier results...
Accuracy: 0.49161607400982943
F-measure [obj]: 0.5737486365289055
F-measure [subj]: 0.3702775290957923
Precision [obj]: 0.44863532979529946
Precision [subj]: 0.6297198538367844
Recall [obj]: 0.7956302521008404
Recall [subj]: 0.26223687547552627


In [100]:
# SAVE AND LOAD

# optional to save your classifier so you can load it elsewhere without having to rebuild training set every time
save_classifier = open("tweetposneg.pickle","wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

# optional load from classifier that was saved previously
# classifier_f = open("naivebayes.pickle", "rb")
# classifier = pickle.load(classifier_f)
# classifier_f.close()