**Text mining project: sentiment analysis**
<br>Group 49

In [12]:
import pandas as pd
# Load the tab-separated test set into a DataFrame.
test_data = pd.read_csv(
    'sentiment-topic-final-test.tsv',
    sep='\t',
)

In [14]:
# Show the column names of the loaded test set.
test_data.columns.tolist()

['sentence id', 'text', 'sentiment', 'topic']

In [16]:
# Third-party libraries used by the cells below.
import nltk
import sklearn
import spacy
from nltk.sentiment import vader
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Fetch the VADER lexicon before building the analyzer that is shared by run_vader().
nltk.download('vader_lexicon')
vader_model = SentimentIntensityAnalyzer()

# spaCy pipeline used by run_vader() for tokens, lemmas and part-of-speech tags.
nlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


**Rule-based sentiment analysis (VADER)**

In [17]:
def run_vader(textual_unit, 
              lemmatize=False, 
              parts_of_speech_to_consider=None,
              verbose=0):
    """
    Run VADER on a textual unit after processing it with spaCy.

    The text is tokenised with the module-level ``nlp`` pipeline, optionally
    lemmatised and filtered by part of speech, re-joined with single spaces
    and scored with the module-level ``vader_model``.

    :param str textual_unit: a textual unit, e.g., sentence, sentences (one string)
    :param bool lemmatize: If True, provide lemmas to VADER instead of words
    :param set parts_of_speech_to_consider:
    -None or empty set: all parts of speech are provided
    -non-empty set: only these parts of speech are considered.
    :param int verbose: if set to 1 (or higher), information is printed
    about input and output

    :rtype: dict
    :return: vader output dict
    """
    doc = nlp(textual_unit)

    input_to_vader = []

    # Iterate over the tokens directly: every token is visited once, in
    # order, without depending on sentence segmentation.
    for token in doc:
        to_add = token.text

        if lemmatize:
            to_add = token.lemma_

            # '-PRON-' is a placeholder lemma (emitted for pronouns by
            # spaCy 2.x); VADER cannot use it, so keep the surface form.
            if to_add == '-PRON-':
                to_add = token.text

        # A non-empty POS filter drops every token whose tag is not listed.
        if parts_of_speech_to_consider and token.pos_ not in parts_of_speech_to_consider:
            continue

        input_to_vader.append(to_add)

    scores = vader_model.polarity_scores(' '.join(input_to_vader))

    if verbose >= 1:
        print()
        # Print the complete input. The previous version printed the loop
        # variable of the sentence loop, i.e. only the last sentence, and
        # failed when the input produced no sentences at all.
        print('INPUT SENTENCE', textual_unit)
        print('INPUT TO VADER', input_to_vader)
        print('VADER OUTPUT', scores)

    return scores

In [18]:
def vader_output_to_label(vader_output):
    """
    Turn a VADER score dict, e.g.
    {'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': 0.4215}
    into a sentiment label based on the sign of its 'compound' value:
    a) compound above zero -> 'positive'
    b) compound equal to 0.0 -> 'neutral'
    c) compound below zero -> 'negative'

    :param dict vader_output: output dict from vader (must contain 'compound')

    :rtype: str
    :return: 'negative' | 'neutral' | 'positive'
    (None if the compound value compares as none of the above, e.g. NaN)
    """
    score = vader_output['compound']

    if score > 0.0:
        return 'positive'
    if score < 0:
        return 'negative'
    if score == 0.0:
        return 'neutral'
    return None


# Quick self-checks: one example per label.
for compound_value, expected_label in ((0.0, 'neutral'), (0.01, 'positive'), (-0.01, 'negative')):
    example_output = {'neg': 0.0, 'neu': 0.0, 'pos': 1.0, 'compound': compound_value}
    assert vader_output_to_label(example_output) == expected_label

In [19]:
# Texts and gold labels, in the row order of the test set.
tweets = test_data['text'].tolist()
gold = test_data['sentiment'].tolist()

# One VADER label per text: score the lemmatised text, then map the score to a category.
all_vader_output = [
    vader_output_to_label(run_vader(text, lemmatize=True))
    for text in tweets
]

In [20]:
# Quantitative evaluation: scikit-learn's classification report for VADER.
# Import the submodule explicitly; a bare `import sklearn` does not guarantee
# that `sklearn.metrics` has been loaded.
import sklearn.metrics

# zero_division=0 reports 0.0 for a class that is never predicted instead of
# emitting an undefined-metric warning for it.
print(sklearn.metrics.classification_report(gold, all_vader_output, zero_division=0))

              precision    recall  f1-score   support

    negative       0.50      0.33      0.40         3
     neutral       0.00      0.00      0.00         3
    positive       0.50      1.00      0.67         4

    accuracy                           0.50        10
   macro avg       0.33      0.44      0.36        10
weighted avg       0.35      0.50      0.39        10



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
# Error analysis (VADER)
# Gold-positive sentences that received a different label
positives_misclasified_indices = []
for idx, (expected, predicted) in enumerate(zip(gold, all_vader_output)):
    if expected != "positive" or expected == predicted:
        continue
    positives_misclasified_indices.append(idx)
    print("Sentence {} should be {}. Misclassified as {}.".format(idx, expected, predicted))

print("\nNumber of positives misclassified: {}\n".format(len(positives_misclasified_indices)))

# Show the text of every misclassified sentence.
for idx in positives_misclasified_indices:
    print(idx, tweets[idx])


Number of positives misclassified: 0



In [22]:
# Negatives misclassified (VADER)
# The list gets its own name; it previously reused the "positives" name
# although it holds gold-negative sentences.
negatives_misclassified_indices = []
for i in range(len(gold)):
    if gold[i] == "negative" and gold[i] != all_vader_output[i]:
        negatives_misclassified_indices.append(i)
        print("Sentence {} should be {}. Misclassified as {}.".format(i, gold[i], all_vader_output[i]))

print("\nNumber of negatives misclassified: {}\n".format(len(negatives_misclassified_indices)))

# Show the text of every misclassified sentence.
for i in negatives_misclassified_indices:
    print(i, tweets[i])

Sentence 2 should be negative. Misclassified as positive.
Sentence 9 should be negative. Misclassified as positive.

Number of negatives misclassified: 2

2 This Italian place is really trendy but they have forgotten about the most important part of a restaurant, the food.
9 I always have loved English novels, but I just couldn't get into this one.


In [23]:
# Neutrals misclassified (VADER)
# The list gets its own name; it previously reused the "positives" name
# although it holds gold-neutral sentences.
neutrals_misclassified_indices = []
for i in range(len(gold)):
    if gold[i] == "neutral" and gold[i] != all_vader_output[i]:
        neutrals_misclassified_indices.append(i)
        print("Sentence {} should be {}. Misclassified as {}.".format(i, gold[i], all_vader_output[i]))

print("\nNumber of neutrals misclassified: {}\n".format(len(neutrals_misclassified_indices)))

# Show the text of every misclassified sentence.
for i in neutrals_misclassified_indices:
    print(i, tweets[i])

Sentence 4 should be neutral. Misclassified as positive.
Sentence 5 should be neutral. Misclassified as positive.
Sentence 8 should be neutral. Misclassified as negative.

Number of neutrals misclassified: 3

4 The story of this movie is focused on Carl Brashear played by Cuba Gooding Jr. who wants to be the first African American deep sea diver in the navy.
5 Chris O'Donnell stated that while filming for this movie, he felt like he was in a toy commercial.
8 The new movie by Mr. Kruno was shot in New York, but the story takes place in Los Angeles.


**Sentiment analysis with transformers**

Link to the pre-trained transformer model: https://huggingface.co/Souvikcmsa/BERT_sentiment_analysis

In [3]:
!conda install pytorch cpuonly -c pytorch
!pip install transformers

In [25]:
from transformers import pipeline

In [27]:
# Text-classification pipeline backed by the pre-trained sentiment model linked above.
classifier = pipeline(
    task="text-classification",
    model="Souvikcmsa/BERT_sentiment_analysis",
)

In [4]:
# Sanity check (TODO: remove before submission): classify the first test
# sentence and print its predicted label. Requires `classifier` to exist,
# so run the cell that creates it first.
# .iloc[0] selects the first row by position, independent of the index labels.
first_prediction = classifier(test_data['text'].iloc[0])
print(first_prediction[0]['label'])

NameError: name 'classifier' is not defined

In [44]:
# Texts and gold labels, in the row order of the test set.
# (The first line of this cell previously started with a stray text fragment,
# which made the whole cell a syntax error.)
tweets = test_data['text'].tolist()
gold = test_data['sentiment'].tolist()

# One transformer label per text: the pipeline returns a list with one
# prediction dict per input, whose 'label' entry is the predicted class.
all_transformer_output = [classifier(text)[0]['label'] for text in tweets]

In [45]:
# Quantitative evaluation: scikit-learn's classification report for the transformer.
# Import the submodule explicitly; a bare `import sklearn` does not guarantee
# that `sklearn.metrics` has been loaded.
import sklearn.metrics

# zero_division=0 reports 0.0 for a class that is never predicted instead of
# emitting an undefined-metric warning for it.
print(sklearn.metrics.classification_report(gold, all_transformer_output, zero_division=0))

              precision    recall  f1-score   support

    negative       1.00      0.67      0.80         3
     neutral       1.00      1.00      1.00         3
    positive       0.80      1.00      0.89         4

    accuracy                           0.90        10
   macro avg       0.93      0.89      0.90        10
weighted avg       0.92      0.90      0.90        10



In [49]:
# Error analysis (transformer)
# Gold-positive sentences that received a different label
positives_misclasified_indices = []
for position, (gold_label, predicted_label) in enumerate(zip(gold, all_transformer_output)):
    is_missed_positive = gold_label == "positive" and gold_label != predicted_label
    if is_missed_positive:
        positives_misclasified_indices.append(position)
        print("Sentence {} should be {}. Misclassified as {}.".format(position, gold_label, predicted_label))

print("\nNumber of positives misclassified: {}\n".format(len(positives_misclasified_indices)))

# Show the text of every misclassified sentence.
for position in positives_misclasified_indices:
    print(position, tweets[position])


Number of positives misclassified: 0



In [50]:
# Negatives misclassified (transformer)
# The list gets its own name; it previously reused the "positives" name
# although it holds gold-negative sentences.
negatives_misclassified_indices = []
for i in range(len(gold)):
    if gold[i] == "negative" and gold[i] != all_transformer_output[i]:
        negatives_misclassified_indices.append(i)
        print("Sentence {} should be {}. Misclassified as {}.".format(i, gold[i], all_transformer_output[i]))

print("\nNumber of negatives misclassified: {}\n".format(len(negatives_misclassified_indices)))

# Show the text of every misclassified sentence.
for i in negatives_misclassified_indices:
    print(i, tweets[i])

Sentence 9 should be negative. Misclassified as positive.

Number of negatives misclassified: 1

9 I always have loved English novels, but I just couldn't get into this one.


In [52]:
# Neutrals misclasified
positives_misclasified_indices = []
for i in range(len(gold)):
    if gold[i] == "neutral" and gold[i] != all_transformer_output[i]:
        positives_misclasified_indices.append(i)
        print("Sentence {} should be {}. Misclassified as {}.".format(i, gold[i], all_transformer_output[i]))

print("\nNumber of neutrals misclassified: {}\n".format(len(positives_misclasified_indices)))

for i in positives_misclasified_indices: print(i, tweets[i])


Number of neutrals misclassified: 0

