# Example from www.nltk.org/howto/sentiment.html
# Testing different sentiment analysis (SA) methods
## NLTK's sentiment analyzer

In [28]:
import nltk
import csv
import pickle
from nltk.classify import NaiveBayesClassifier
from nltk.sentiment import SentimentAnalyzer
import random

In [29]:
# Sentiment labels as they appear in the dataset (this is specific to the dataset)
negative_label = 'negative'
neutral_label = 'neutral'
positive_label = 'positive'
irrelevant_label = 'irrelevant'

# Fixed seed so the shuffle (and therefore the train/test split and every
# metric computed from it) is identical on each Restart & Run All.
SHUFFLE_SEED = 42

# newline='' is what the csv module asks for: it lets the reader handle line
# endings itself, so quoted fields containing line breaks are parsed correctly.
with open('./../../../data/external/sanders.csv', newline='') as csv_file:
    iterator = csv.reader(csv_file, delimiter=',')
    # Every row is kept, whatever its label. To restrict the data to polar
    # tweets only, append
    #     if sentiment in (negative_label, positive_label)
    # to the comprehension below.
    tweets = [(tweet, sentiment) for (_topic, sentiment, _tweet_id, tweet) in iterator]

# The tweets are ordered in the file, so shuffle before the training/test split.
# A dedicated Random instance keeps the global random state untouched.
random.Random(SHUFFLE_SEED).shuffle(tweets)

In [31]:
# Sanity check: number of (tweet, sentiment) pairs read from the CSV
len(tweets)

4345

In [33]:
# Preprocess tweets
# NOTE(review): `preprocess` is neither defined nor imported in any cell visible
# here — confirm where it comes from, otherwise this cell fails on a fresh kernel.
# Judging by the output displayed for the first item, each entry appears to be a
# (list of tokens, label) pair — TODO confirm against the helper's definition.
preprocessed_tweets = preprocess(tweets)
# Show the first preprocessed entry as a spot check
preprocessed_tweets[0]

(["'",
  '#Fcbk',
  'empieza',
  'a',
  'agregar',
  'nuevas',
  'aplicaciones',
  'volviendose',
  'cada',
  'vez',
  'mas',
  'obsoleto',
  'y',
  'aburrido',
  '...',
  '#Twitter',
  'es',
  'mejor',
  'que',
  '#fcbk',
  '#hedicho',
  '#jum',
  "'"],
 'irrelevant')

In [34]:
# Hold-out split: the first 90% of the preprocessed tweets are used for
# training, the remaining 10% are set aside for testing.
threshold = int(0.9 * len(preprocessed_tweets))
train_tweets, test_tweets = (
    preprocessed_tweets[:threshold],
    preprocessed_tweets[threshold:],
)

In [35]:
# Create the analyzer, then gather the words of the training documents after
# each document has been passed through mark_negation.
sentim_analyzer = SentimentAnalyzer()
negation_marked_docs = [
    nltk.sentiment.util.mark_negation(doc) for doc in train_tweets
]
all_words_neg = sentim_analyzer.all_words(negation_marked_docs)

We use simple unigram word features, handling negation:

In [36]:
# Select unigram features from the negation-marked training words.
# NOTE(review): min_freq=4 presumably drops words seen fewer than 4 times —
# confirm against the NLTK SentimentAnalyzer documentation.
unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
# Number of unigram features selected
len(unigram_feats)

1678

In [37]:
# Register the unigram feature extractor on the analyzer, passing it the
# unigram list computed in the previous cell.
sentim_analyzer.add_feat_extractor(nltk.sentiment.util.extract_unigram_feats, unigrams=unigram_feats)

We apply features to obtain a feature-value representation of our datasets:

In [38]:
# Run the registered feature extractor(s) over both splits to obtain the
# feature-value representation used for training and evaluation.
training_set = sentim_analyzer.apply_features(train_tweets)
test_set = sentim_analyzer.apply_features(test_tweets)

We can now train our classifier on the training set, and subsequently output the evaluation results:

In [39]:
# Use NLTK's Naive Bayes training function as the trainer and fit it on the
# training feature set through the analyzer.
trainer = NaiveBayesClassifier.train
classifier = sentim_analyzer.train(trainer, training_set)
# TODO: check the source of nltk.sentiment.util.demo_tweets
# (sadly the dataset it relies on cannot be downloaded), hence the call stays disabled:
# nltk.sentiment.util.demo_tweets(trainer)

Training classifier


In [40]:
# Evaluate on the held-out set and print each metric, ordered by metric name.
evaluation = sentim_analyzer.evaluate(test_set)
for metric in sorted(evaluation):
    print('{0}: {1}'.format(metric, evaluation[metric]))

Evaluating NaiveBayesClassifier results...
Accuracy: 0.7172413793103448
F-measure [irrelevant]: 0.851190476190476
F-measure [negative]: 0.5346534653465346
F-measure [neutral]: 0.7191011235955056
F-measure [positive]: 0.36363636363636365
Precision [irrelevant]: 0.7944444444444444
Precision [negative]: 0.5192307692307693
Precision [neutral]: 0.7150837988826816
Precision [positive]: 0.5833333333333334
Recall [irrelevant]: 0.9166666666666666
Recall [negative]: 0.5510204081632653
Recall [neutral]: 0.7231638418079096
Recall [positive]: 0.2641509433962264


In [41]:
# SAVE AND LOAD

# Optional: persist the trained model so it can be loaded elsewhere without
# having to rebuild the training set every time.
# The object pickled is the whole SentimentAnalyzer, not only the classifier.
# NOTE: only unpickle this file from a trusted location — pickle.load can
# execute arbitrary code.
# The `with` block guarantees the file is closed even if pickling raises.
with open("./../../models/naive_bayes/nltk_sentiment_analyzer_all_labels.pickle", "wb") as save_classifier:
    pickle.dump(sentim_analyzer, save_classifier)