In [10]:
import pandas as pd

from nltk.classify import NaiveBayesClassifier
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

# Useful references:
# https://towardsdatascience.com/basic-binary-sentiment-analysis-using-nltk-c94ba17ae386

# NLTK also provides helpers for other kinds of features, e.g.:
# from nltk.sentiment.util import (mark_negation, extract_unigram_feats)
# mark_negation(): append a _NEG suffix to words that appear in the scope
# between a negation and a punctuation mark.
# extract_unigram_feats(): populate a dictionary of unigram features reflecting
# the presence/absence in the document of each of the tokens in unigrams.

# Data locations. In a previous step the data was tokenized, pre-processed
# and written out to these CSV files.
PROC_DIR = 'data/'
TRAIN, DEV = (PROC_DIR + file_name for file_name in ('train.csv', 'dev.csv'))

# Load the train and dev splits into DataFrames.
df_train, df_dev = (pd.read_csv(csv_path) for csv_path in (TRAIN, DEV))

In [11]:
# Restrict the training frame to the columns used by the rest of the notebook.
train_columns = ['id', 'label', 'text']
df_train = pd.DataFrame(df_train, columns=train_columns)

In [12]:
# Feature extraction, step 1: split the training frame by sentiment label,
# then pull the raw text of each class into a plain Python list.
label_column = df_train['label']

df_pos_train = df_train.loc[label_column == 'positive']
df_neg_train = df_train.loc[label_column == 'negative']
df_neutral_train = df_train.loc[label_column == 'neutral']

pos_tweets = df_pos_train['text'].tolist()
neg_tweets = df_neg_train['text'].tolist()
neutral_tweets = df_neutral_train['text'].tolist()

In [13]:
# How balanced is this training set? Start with the number of positive rows.
df_pos_train.shape[0]

3094

In [14]:
# Number of negative rows in the training set.
df_neg_train.shape[0]

863

In [15]:
# Number of neutral rows in the training set.
df_neutral_train.shape[0]

2043

In [16]:
def features(sentence):
    """Build a bag-of-words feature dict for one piece of text.

    The text is lower-cased and split on whitespace; every distinct token
    ``w`` becomes a ``'contains(w)': True`` entry.

    Args:
        sentence: The text to featurize. ``None`` or a float NaN (the value
            pandas uses for an empty CSV cell) is treated as empty text
            instead of raising ``AttributeError``.

    Returns:
        dict mapping ``'contains(<token>)'`` to ``True``; empty when the
        text has no tokens or is missing.
    """
    # NaN is the only float that is not equal to itself, so this detects a
    # missing value without needing an extra import.
    if sentence is None or (isinstance(sentence, float) and sentence != sentence):
        return {}
    return {'contains(%s)' % token: True for token in sentence.lower().split()}

# Pair each tweet's feature dict with its class label, one list per class.
positive_featuresets = [(feats, 'positive') for feats in map(features, pos_tweets)]
negative_featuresets = [(feats, 'negative') for feats in map(features, neg_tweets)]
neutral_featuresets = [(feats, 'neutral') for feats in map(features, neutral_tweets)]

# Single training set: positives first, then negatives, then neutrals.
training_features = [
    *positive_featuresets,
    *negative_featuresets,
    *neutral_featuresets,
]

In [17]:
# Total number of labelled feature sets across all three classes.
len(training_features)

6000

In [18]:
# Train a Naive Bayes classifier through NLTK's SentimentAnalyzer wrapper.
# `trainer` is the training function itself (not a call); the analyzer
# invokes it on the labelled feature sets and returns the fitted classifier.
trainer = NaiveBayesClassifier.train
sentiment_analyzer = SentimentAnalyzer()
classifier = sentiment_analyzer.train(trainer, training_features)

Training classifier


In [19]:
# Create evaluation data: one (text, label) tuple per row of the dev set.
truth_list = list(zip(df_dev['text'], df_dev['label']))
len(truth_list)

1999

In [20]:
# Sanity check: inspect one (text, label) pair to make sure the dataframe
# was converted into tuples properly.
truth_list[100]

('barack michelle obama walk runway style tonight tomorrow life complete',
 'positive')

In [21]:
# The evaluation method needs the same feature extractor that was run to train
# the classifier. Specifically, it wants a list of (features, truth) tuples,
# where features is a dict.
#
# The list is rebuilt from df_dev instead of overwriting truth_list entries in
# place, so this cell can be re-run safely: re-featurizing an already
# converted entry would call features() on a dict and fail.
truth_list = [
    (features(text), expected)
    for text, expected in df_dev[['text', 'label']].itertuples(index=False, name=None)
]
truth_list[100]

({'contains(barack)': True,
  'contains(michelle)': True,
  'contains(obama)': True,
  'contains(walk)': True,
  'contains(runway)': True,
  'contains(style)': True,
  'contains(tonight)': True,
  'contains(tomorrow)': True,
  'contains(life)': True,
  'contains(complete)': True},
 'positive')

In [22]:
# Score the trained classifier on the dev feature sets; the resulting dict of
# metrics is displayed as the cell output.
sentiment_analyzer.evaluate(truth_list, classifier=classifier)

Evaluating NaiveBayesClassifier results...


{'Accuracy': 0.4807403701850925,
 'Precision [positive]': 0.6909620991253644,
 'Recall [positive]': 0.5622775800711743,
 'F-measure [positive]': 0.6200130804447351,
 'Precision [negative]': 0.31548311990686845,
 'Recall [negative]': 0.6930946291560103,
 'F-measure [negative]': 0.4336,
 'Precision [neutral]': 0.47577092511013214,
 'Recall [neutral]': 0.2823529411764706,
 'F-measure [neutral]': 0.3543888433141919}

In [23]:
# Example of how to get at individual metrics: the evaluation result is a dict
# keyed by metric name, so walk it in sorted key order and print each entry.
metrics = sentiment_analyzer.evaluate(truth_list)
for metric_name in sorted(metrics):
    print('{0}: {1}'.format(metric_name, metrics[metric_name]))

Evaluating NaiveBayesClassifier results...
Accuracy: 0.4807403701850925
F-measure [negative]: 0.4336
F-measure [neutral]: 0.3543888433141919
F-measure [positive]: 0.6200130804447351
Precision [negative]: 0.31548311990686845
Precision [neutral]: 0.47577092511013214
Precision [positive]: 0.6909620991253644
Recall [negative]: 0.6930946291560103
Recall [neutral]: 0.2823529411764706
Recall [positive]: 0.5622775800711743
