# Sentiment Analysis
<p> We compare two sentiment-analysis approaches to see which one produces more accurate results. Both are run against our Kaggle dataset and scored against its <code>airline_sentiment</code> labels. </p>

### Sentiment Model #1 -- VADER and TextBlob

In [30]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# import geopandas as gp

import nltk
# Fetch the NLTK resources this notebook asks for. The VADER lexicon is what
# SentimentIntensityAnalyzer scores against; 'stopwords' is not referenced in
# the cells shown here.
nltk.download('vader_lexicon')
nltk.download('stopwords')

# NOTE(review): wildcard import hides which names the notebook relies on. Only
# PorterStemmer is referenced in the visible cells, and `stemmer` itself is not
# used further down -- confirm before narrowing or removing.
from nltk.stem.porter import *
stemmer = PorterStemmer()
from nltk.sentiment import SentimentIntensityAnalyzer
# Shared VADER analyzer; polarity_scores() is called on it once per tweet below.
sia = SentimentIntensityAnalyzer()

from textblob import TextBlob
from textblob import Blobber
from textblob.sentiments import NaiveBayesAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/shrutikorada/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shrutikorada/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [31]:
# Load the labelled tweets from the local CSV.
tweets = pd.read_csv('Tweets.csv')
# NOTE(review): `frame` is not created in any cell visible in this notebook, so
# this cell fails on a fresh kernel unless something earlier defines it.
# Presumably it carries the cleaned text (`tweet`) and `cluster` columns seen in
# the output -- confirm where it comes from.
# NOTE(review): reassigning `frame` from itself makes the cell non-idempotent;
# each re-run resets the index again (the `level_0` column in the displayed
# output may be a by-product of that -- verify).
frame = frame.reset_index()
# join() aligns the two frames on their row index.
tweets = tweets.join(frame)
tweets = tweets.rename(columns={"tweet":"Tweet"})
tweets

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone,level_0,Tweet,cluster
0,570306133677760513,neutral,1.0000,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada),2,What said.,2
1,570301130888122368,positive,0.3486,,0.0000,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada),2,plus you've added commercials to the eerience....,2
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada),2,I didn't today... Must mean I need to take ano...,2
3,570301031407624196,negative,1.0000,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada),0,"it's really aggressive to blast obnoxious ""ent...",0
4,570300817074462722,negative,1.0000,Can't Tell,1.0000,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada),0,and it's a really big bad thing about it,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14635,569587686496825344,positive,0.3487,,0.0000,American,,KristenReenders,,0,@AmericanAir thank you we got on a different f...,,2015-02-22 12:01:01 -0800,,,2,thank you we got on a different flight to Chic...,2
14636,569587371693355008,negative,1.0000,Customer Service Issue,1.0000,American,,itsropes,,0,@AmericanAir leaving over 20 minutes Late Flig...,,2015-02-22 11:59:46 -0800,Texas,,2,leaving over minutes Late Flight. No warnings ...,2
14637,569587242672398336,neutral,1.0000,,,American,,sanyabun,,0,@AmericanAir Please bring American Airlines to...,,2015-02-22 11:59:15 -0800,"Nigeria,lagos",,2,Please bring American Airlines to,2
14638,569587188687634433,negative,1.0000,Customer Service Issue,0.6659,American,,SraJackson,,0,"@AmericanAir you have my money, you change my ...",,2015-02-22 11:59:02 -0800,New Jersey,Eastern Time (US & Canada),3,"you have my money, you change my flight, and d...",3


In [32]:
# Score each tweet with VADER, keeping only the 'compound' value.
scores = [sia.polarity_scores(tweet)['compound'] for tweet in tweets['Tweet']]
tweets['sentiment_scores'] = scores

# Turn the numeric score into a label: above zero is positive, below zero is
# negative, anything else is neutral.
vader_labels = []
for w in tweets['sentiment_scores']:
    if w > 0:
        vader_labels.append("positive")
    elif w < 0:
        vader_labels.append("negative")
    else:
        vader_labels.append("neutral")
tweets['sentiment_derived'] = vader_labels

In [33]:
# Share of tweets whose VADER-derived label equals the dataset's own label.
# 'match' is 1 for agreement and 0 otherwise, so its mean is the match rate.
tweets['match'] = (tweets['sentiment_derived'] == tweets['airline_sentiment']).astype(int)
tweets['match'].mean()

0.5450136612021858

In [34]:
# Confusion table: dataset label (rows) against VADER-derived label (columns).
pd.crosstab(index=tweets['airline_sentiment'], columns=tweets['sentiment_derived'])

sentiment_derived,negative,neutral,positive
airline_sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
negative,4604,1775,2799
neutral,427,1331,1341
positive,97,222,2044


About 55% of the derived sentiment labels match the original labels. Most of the errors are negative tweets that are misclassified as neutral or positive, and neutral tweets that are misclassified as positive. Next, we assess an additional sentiment analyzer to improve accuracy:

In [35]:
# NOTE(review): this Naive Bayes blobber is constructed but never used in the
# visible cells; the scores below come from plain TextBlob(...) objects.
blobber = Blobber(analyzer=NaiveBayesAnalyzer())

# First field of TextBlob's sentiment result (polarity) for each tweet.
scores = [TextBlob(tweet).sentiment.polarity for tweet in tweets['Tweet']]
tweets['textblob_scores'] = scores

# Same sign-based labelling as for the VADER scores.
textblob_labels = []
for w in tweets['textblob_scores']:
    if w > 0:
        textblob_labels.append("positive")
    elif w < 0:
        textblob_labels.append("negative")
    else:
        textblob_labels.append("neutral")
tweets['textblob_derived'] = textblob_labels

In [36]:
# Agreement table: VADER-derived label (rows) against TextBlob-derived label (columns).
pd.crosstab(index=tweets['sentiment_derived'], columns=tweets['textblob_derived'])

textblob_derived,negative,neutral,positive
sentiment_derived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
negative,2385,1701,1042
neutral,424,2226,678
positive,676,1780,3728


In [37]:
def combined_sentiment(tweets):
    """Merge the TextBlob and VADER labels of one row into a single label.

    Despite its name, `tweets` is a single row (anything indexable by the
    column names 'textblob_derived' and 'sentiment_derived'), as passed by
    ``DataFrame.apply(..., axis=1)``.

    Rules, in order:
      * either label is 'negative'           -> 'negative'
      * one 'neutral' and one 'positive'     -> 'neutral'
      * both 'neutral'                       -> 'negative'
      * both 'positive'                      -> 'positive'
      * any other combination                -> '0'

    NOTE(review): neutral + neutral resolving to 'negative' looks like a
    deliberate bias toward the negative class -- confirm it is intended.
    """
    textblob_label = tweets['textblob_derived']
    if textblob_label == 'negative':
        return 'negative'

    vader_label = tweets['sentiment_derived']
    if vader_label == 'negative':
        return 'negative'

    # Remaining combinations, keyed by (textblob label, vader label).
    resolution = {
        ('neutral', 'positive'): 'neutral',
        ('positive', 'neutral'): 'neutral',
        ('neutral', 'neutral'): 'negative',
        ('positive', 'positive'): 'positive',
    }
    return resolution.get((textblob_label, vader_label), '0')

In [38]:
# Resolve the two per-tweet labels into one by running combined_sentiment on every row.
tweets['final_derived'] = tweets.apply(func=combined_sentiment, axis='columns')

In [39]:
# Confusion table: combined label (rows) against dataset label (columns).
pd.crosstab(index=tweets['final_derived'], columns=tweets['airline_sentiment'])

airline_sentiment,negative,neutral,positive
final_derived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
negative,6508,1623,323
neutral,1275,801,382
positive,1395,675,1658


In [40]:
# Share of tweets whose combined (TextBlob + VADER) label equals the dataset's
# own label. This overwrites the 'match' column computed earlier for VADER alone.
tweets['match'] = (tweets['final_derived'] == tweets['airline_sentiment']).astype(int)
tweets['match'].mean()

0.6125

Accuracy has improved moderately, from about 55% to about 61%, with a combination of sentiment classifiers.

In [41]:
# Fraction of tweets labelled negative in each cluster, using the combined derived label.
tweets['negative'] = tweets['final_derived'].eq('negative')
tweets.groupby('cluster')['negative'].mean()

cluster
0    0.609630
1    0.622733
2    0.569824
3    0.514493
4    0.629333
Name: negative, dtype: float64

In [42]:
# Fraction of tweets labelled negative in each cluster, using the dataset's own label.
tweets['negative_orig'] = tweets['airline_sentiment'].eq('negative')
tweets.groupby('cluster')['negative_orig'].mean()

cluster
0    0.568358
1    0.700121
2    0.621626
3    0.526570
4    0.773333
Name: negative_orig, dtype: float64

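Initial results indicated that Cluster 3 (key words: "delayed", "flight", "hour", "missed", "connecting", "plane") is the most negative. **Note:** in the output shown above, cluster 3 has the *lowest* share of negative tweets (about 51% derived, 53% original) and cluster 4 the highest (about 63% derived, 77% original). Cluster numbering may have changed since this conclusion was written, so it should be re-checked against the current cluster key words.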

In [43]:
# Preview the first five rows with all derived columns attached.
tweets.head(n=5)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,...,Tweet,cluster,sentiment_scores,sentiment_derived,match,textblob_scores,textblob_derived,final_derived,negative,negative_orig
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,...,What said.,2,0.0,neutral,0,0.0,neutral,negative,True,False
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,...,plus you've added commercials to the eerience....,2,0.0,neutral,0,0.0,neutral,negative,True,False
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,...,I didn't today... Must mean I need to take ano...,2,0.0,neutral,0,-0.390625,negative,negative,True,False
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,...,"it's really aggressive to blast obnoxious ""ent...",0,-0.5984,negative,1,0.00625,positive,negative,True,True
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,...,and it's a really big bad thing about it,0,-0.5829,negative,1,-0.35,negative,negative,True,True


In [44]:
# Full cleaned text of the row labelled 3 (truncated in the table above).
tweets.loc[3, 'Tweet']

'it\'s really aggressive to blast obnoxious "entertainment" in your guests\' faces & they have little recourse'

### Sentiment Model #2 -- Hugging Face

In [3]:
import csv
import shutil
import urllib.request

import numpy as np
import torch
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification

In [4]:
# Preprocess text (username and link placeholders)
def preprocess(text):
    """Replace user mentions and links in `text` with fixed placeholders.

    The text is split on single spaces. A token that starts with '@' and is
    longer than one character becomes '@user'; a token that starts with 'http'
    becomes 'http'. All other tokens (including empty ones produced by
    consecutive spaces) are kept as-is, and the tokens are re-joined with
    single spaces.
    """
    def _placeholder(token):
        if token.startswith('@') and len(token) > 1:
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_placeholder(token) for token in text.split(" "))

In [5]:
# Remove any local ./cardiffnlp directory left by a previous run (a later cell
# saves the model and tokenizer under a path that starts with "cardiffnlp/").
# Presumably this keeps from_pretrained(MODEL) from picking up a stale or
# partial local copy -- confirm.
# shutil.rmtree with ignore_errors=True is a no-op when the directory does not
# exist and does not depend on IPython's shell automagic.
shutil.rmtree('./cardiffnlp', ignore_errors=True)

rm: ./cardiffnlp: No such file or directory


In [6]:
# Tasks:
# emoji, emotion, hate, irony, offensive, sentiment
# stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary

# `task` selects both the model checkpoint here and the label mapping file
# downloaded in the next cell, so the two cannot drift apart.
# NOTE(review): the stance/* task names contain a slash and may not map
# directly onto a checkpoint name -- verify before switching to one of them.
task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [7]:
# Download the label mapping for `task`: a tab-separated file whose second
# column is the label name.
labels = []
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as response:
    mapping_lines = response.read().decode('utf-8').split("\n")

# Skip rows without a second column (e.g. the empty trailing line).
for row in csv.reader(mapping_lines, delimiter='\t'):
    if len(row) > 1:
        labels.append(row[1])

In [45]:
# PT
# Load the sequence-classification model identified by MODEL (the non-TF
# AutoModel class, i.e. presumably the PyTorch weights).
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# Save model and tokenizer to a local path equal to the MODEL string. This
# creates a local directory with the same name as the model identifier, so the
# load above has to happen before these saves on a clean checkout.
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)

In [48]:
def classify_tweet(text):
    """Return the top-ranked label for one tweet.

    The text is passed through `preprocess`, tokenized, and run through
    `model`; the scores are soft-maxed and the label with the highest score
    (looked up in `labels`) is returned.
    """
    encoded_input = tokenizer(preprocess(text), return_tensors='pt')
    # Inference only: no_grad() avoids building an autograd graph for every tweet.
    with torch.no_grad():
        output = model(**encoded_input)
    # output[0][0]: presumably the raw class scores for the single input text.
    scores = softmax(output[0][0].detach().numpy())
    # argsort is ascending; reverse it so index 0 is the highest-scoring class.
    ranking = np.argsort(scores)[::-1]
    return labels[ranking[0]]


# One predicted label per tweet, in row order.
final_scores = [classify_tweet(tweet) for tweet in tweets['Tweet']]

In [50]:
# Attach the model's predicted label to each tweet, row for row.
tweets['hugging_face'] = pd.Series(final_scores, index=tweets.index)

In [53]:
#tweets.to_csv('tweets_sentiment_refined.csv')

In [51]:
# Confusion table: Hugging Face model label (rows) against dataset label (columns).
pd.crosstab(index=tweets['hugging_face'], columns=tweets['airline_sentiment'])

airline_sentiment,negative,neutral,positive
hugging_face,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
negative,6287,357,39
neutral,2389,2208,298
positive,502,534,2026


In [52]:
# Share of tweets whose Hugging Face label equals the dataset's own label.
# 'match_hf' is 1 for agreement and 0 otherwise, so its mean is the match rate.
tweets['match_hf'] = (tweets['hugging_face'] == tweets['airline_sentiment']).astype(int)
tweets['match_hf'].mean()

0.7186475409836065

Using the Twitter-roBERTa-base model for sentiment analysis improves accuracy by more than 10 percentage points over Model #1, from about 61% to about 72%.