## Comparing Flair, VADER, and TextBlob
Three out-of-the-box, ready-to-use sentiment analysis models.

In [1]:
# load data:
import pandas as pd
import sqlite3
# Read the whole `sentiment_compare` table into a DataFrame. The connection is
# closed as soon as the read finishes (even if it fails) so the database file
# handle is not left open for the rest of the kernel session.
con = sqlite3.connect("test_data/sentiment_compare.db")
try:
    df = pd.read_sql("select * from sentiment_compare", con)
finally:
    con.close()
df.head()

Unnamed: 0,song_index,lyrics
0,saintjhn/roses,\n\r\nRoses\nI walked in the corner with the b...
1,dababy/rockstar,"\n\n[DaBaby:]\nWoo, woo\nI pull up like\nHow y..."
2,jawsh/savagelovelaxedsirenbeat,!1
3,juicewrld/comego,"\n\r\nWoah\nUh (Uh)\nOh, oh-oh, oh (Mello made..."
4,theweeknd/blindinglights,!1


In [29]:
# Keep only rows whose lyrics are not the '!1' marker seen in the preview above
# (presumably a placeholder for lyrics that could not be fetched -- confirm).
df_test = df.loc[df['lyrics'] != '!1']

In [3]:
len(df_test) #not bad!

83

In [73]:
# Show the identifier and raw lyrics of the row labelled 0 as a sanity check.
for column in ('song_index', 'lyrics'):
    print(df.loc[0, column])

saintjhn/roses


Roses
I walked in the corner with the body screaming dolo
Never sold a bag but look like Pablo in a photo
This gon' make 'em feel the way like Tony killed Manolo
You already know though, you already know though
I walk in the corner with the money, on my finger
She might get it popping, I might wife her for the winter
I already know, already know, nigga roses
All I need is roses

Turn up baby, turn up, when I turn it on
You know how I get too lit when I turn it on
Can't handle my behavior when I turn it on
Too fast, never ask, if the life don't last
Done been through it all
Fuck with a nigga raw, this who you wanna be
And I know you won't tell nobody nothing
And I know you won't tell nobody no

Roses
I might pull up flexing on these niggas like aerobics
I might tell her girl you cute but balling
That shit gorgeous
Standing on the table, RosÃ©, RosÃ©, fuck the waters
You know who to the god is

Turn up baby, turn up, when I turn it on
You know how I get too lit when I tu

In [83]:
# Clean up lyrics: TBD
import re
import unidecode

def cleanlyrics(lyrics):
    """
    Split raw lyrics into a list of cleaned, non-trivial lines.

    Processing steps, in order:
      1. drop the ``(`` and ``)`` characters, keeping the text between them;
      2. delete ``[...]`` tags together with their content;
      3. transliterate accented characters to ASCII with ``unidecode``;
      4. split on newlines and strip surrounding whitespace from each line.

    Args:
        lyrics (str): Raw lyrics text; CRLF line endings are tolerated.

    Returns:
        list[str]: The cleaned lines, omitting every line that is empty or
        only one character long after stripping.
    """
    # Keep the words inside parentheses, but drop the parentheses themselves.
    lyrics = lyrics.replace('(', '').replace(')', '')

    # Raw string so the backslashes are regex escapes rather than (invalid)
    # string escapes. `.` does not match a newline, so only tags that open
    # and close on the same line are removed.
    lyrics = re.sub(r"\[.*?\]", "", lyrics)

    # Unaccent the string.
    lyrics = unidecode.unidecode(lyrics)

    # Stripping removes the stray '\r' left by CRLF endings and turns
    # whitespace-only lines into empty strings, which the length filter then
    # discards along with single-character lines.
    stripped_lines = (line.strip() for line in lyrics.split('\n'))
    return [line for line in stripped_lines if len(line) > 1]

# Flair

In [6]:
import torch
from flair.models import TextClassifier
from flair.data import Sentence

# load tagger
classifier = TextClassifier.load('sentiment')

  from .autonotebook import tqdm as notebook_tqdm


2022-10-11 11:35:43,426 loading file /Users/williamcartar/.flair/models/sentiment-en-mix-distillbert_4.pt


In [7]:
def flair_sentiment(phrase):
    """
    Score ``phrase`` with the Flair classifier as a signed confidence.

    Returns the prediction score for a ``'POSITIVE'`` tag, the negated score
    for a ``'NEGATIVE'`` tag, and 0 for any other tag.
    """
    sentence = Sentence(phrase)
    # predict() stores its result on `sentence`; it is read back below.
    classifier.predict(sentence)

    label = sentence.tag
    if label == 'NEGATIVE':
        return sentence.score * -1
    if label == 'POSITIVE':
        return sentence.score
    return 0

In [8]:
flair_sentiment('I hate you')

-0.9997355341911316

# VADER

In [9]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
#note: depending on how you installed (e.g., using source code download versus pip install), you may need to import like this:
#from vaderSentiment import SentimentIntensityAnalyzer

vader_analyzer = SentimentIntensityAnalyzer()

In [10]:
def vader_sentiment(phrase):
    """Return the ``'compound'`` entry of VADER's polarity scores for ``phrase``."""
    polarity = vader_analyzer.polarity_scores(phrase)
    return polarity['compound']

In [11]:
vader_sentiment('I hate you')

-0.5719

# TextBlob

In [22]:
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer

In [23]:
def textblob_sentiment(phrase):
    """Return TextBlob's polarity for ``phrase`` multiplied by its subjectivity."""
    result = TextBlob(phrase).sentiment
    polarity, subjectivity = result.polarity, result.subjectivity
    return polarity * subjectivity
# Shared NaiveBayesAnalyzer, created on first use. Building a new analyzer for
# every phrase would make each call train its own model again.
_bayes_analyzer = None


def textblobbayes_sentiment(phrase):
    """
    Classify ``phrase`` with TextBlob's Naive Bayes sentiment analyzer.

    Args:
        phrase (str): Text to classify.

    Returns:
        The ``sentiment`` value TextBlob produces for this analyzer (the demo
        output in this notebook shows ``classification``, ``p_pos`` and
        ``p_neg`` fields).
    """
    global _bayes_analyzer
    if _bayes_analyzer is None:
        _bayes_analyzer = NaiveBayesAnalyzer()
    return TextBlob(phrase, analyzer=_bayes_analyzer).sentiment

In [24]:
textblob_sentiment('I hate you')

-0.7200000000000001

In [25]:
textblobbayes_sentiment('I hate you')

Sentiment(classification='pos', p_pos=0.5153573426705306, p_neg=0.48464265732946954)

# Discussion
You'll see that Flair is the most certain of its answers, which is because it was trained on IMDB reviews. VADER, by contrast, was trained on social media text, so it has the context that people will sometimes say "I hate you" but really mean it in a positive way.

As we can see, TextBlob's default analyzer seems to handle basic sentences better than its Naive Bayes analyzer, which classified "I hate you" as positive. As such, we'll be using the default analyzer and dropping the Bayes analyzer.

# Compare!

In [86]:
def compare_analyzers(lyrics):
    """
    Score every cleaned lyric line with Flair, VADER and TextBlob.

    Args:
        lyrics (str): Raw lyrics; they are split into lines by ``cleanlyrics``.

    Returns:
        tuple: ``(scores, total_scores)`` where

        * ``scores`` is a list of ``(line, {'flair': ..., 'vader': ...,
          'textblob': ...})`` pairs, one per cleaned line, and
        * ``total_scores`` maps each analyzer name to its mean score over all
          lines. When no lines survive cleaning, every mean is 0 instead of
          raising ``ZeroDivisionError``.
    """
    # Name -> scoring function; insertion order fixes both the call order
    # per line and the key order of the returned dictionaries.
    analyzers = {
        'flair': flair_sentiment,
        'vader': vader_sentiment,
        'textblob': textblob_sentiment,
    }

    lines = cleanlyrics(lyrics)
    scores = [
        (line, {name: analyze(line) for name, analyze in analyzers.items()})
        for line in lines
    ]

    total_scores = dict.fromkeys(analyzers, 0)
    for _, line_scores in scores:
        for name, value in line_scores.items():
            total_scores[name] += value

    # Normalize to a per-line average; skipped for empty input so the
    # totals stay at 0 rather than dividing by zero.
    if lines:
        for name in total_scores:
            total_scores[name] = total_scores[name] / len(lines)

    return scores, total_scores

In [105]:
# Iterating over two columns, use `zip`
# Pair each song identifier with its lyrics as (song_index, lyrics) tuples.
songs = list(zip(df_test['song_index'], df_test['lyrics']))

In [130]:
# Column-wise accumulators: one entry per song, in the order of `songs`.
names, flair, vader, textblob = [], [], [], []

for song_index, lyrics in songs:
    # Only the per-song averages are needed; the per-line scores are ignored.
    _, averages = compare_analyzers(lyrics)
    names.append(song_index)
    flair.append(averages['flair'])
    vader.append(averages['vader'])
    textblob.append(averages['textblob'])
    

In [131]:
# Assemble the per-song averages into one frame, one column per analyzer.
data = dict(song_index=names, flair=flair, vader=vader, textblob=textblob)
df_out = pd.DataFrame(data)

In [132]:
df_out.head()

Unnamed: 0,song_index,flair,vader,textblob
0,saintjhn/roses,0.04262,-0.056735,0.001635
1,dababy/rockstar,0.344,0.140809,0.017396
2,juicewrld/comego,0.628804,0.311418,0.094232
3,harrystyles/watermelonsugar,-0.254719,0.062711,0.064287
4,jackharlow/whatspoppin,0.139775,-0.03298,-0.020325


# Evaluation:
Look at the distance of each analyzer away from the average, and when the distance is the greatest, who was right?

In [167]:
# 'combined' is the plain mean of the three analyzers for each song.
df_out['combined'] = (df_out['flair'] + df_out['vader'] + df_out['textblob']) / 3

# Each '<analyzer>_mse' column holds that analyzer's squared deviation from
# the combined score for the row.
for analyzer in ('flair', 'vader', 'textblob'):
    deviation = df_out[analyzer] - df_out['combined']
    df_out[analyzer + '_mse'] = deviation * deviation

df_out.head()

Unnamed: 0,song_index,flair,vader,textblob,combined,flair_mse,vader_mse,textblob_mse
0,saintjhn/roses,0.04262,-0.056735,0.001635,-0.00416,0.002188,0.002764,3.4e-05
1,dababy/rockstar,0.344,0.140809,0.017396,0.167402,0.031187,0.000707,0.022502
2,juicewrld/comego,0.628804,0.311418,0.094232,0.344818,0.080648,0.001116,0.062793
3,harrystyles/watermelonsugar,-0.254719,0.062711,0.064287,-0.042574,0.045005,0.011085,0.011419
4,jackharlow/whatspoppin,0.139775,-0.03298,-0.020325,0.028823,0.01231,0.00382,0.002416


In [169]:
df_out.sort_values(['flair_mse'], ascending=False).head(2)

Unnamed: 0,song_index,flair,vader,textblob,combined,flair_mse,vader_mse,textblob_mse
75,highvalley/grewuponthat,0.78149,0.066798,0.045724,0.298004,0.233759,0.053456,0.063645
62,popsmoke/somethingspecial,0.735577,0.071358,0.032998,0.279978,0.207571,0.043522,0.060999


In [170]:
df_out.sort_values(['vader_mse'], ascending=False).head(2)

Unnamed: 0,song_index,flair,vader,textblob,combined,flair_mse,vader_mse,textblob_mse
19,popsmoke/forthenight,0.477935,-0.109602,-0.003272,0.121687,0.126913,0.053495,0.015615
75,highvalley/grewuponthat,0.78149,0.066798,0.045724,0.298004,0.233759,0.053456,0.063645


In [171]:
df_out.sort_values(['textblob_mse'], ascending=False).head(2)

Unnamed: 0,song_index,flair,vader,textblob,combined,flair_mse,vader_mse,textblob_mse
74,keithurban/godwhisperedyourname,0.646084,0.314365,0.026304,0.328918,0.100595,0.000212,0.091575
60,jonasbrothers/x,0.596392,0.192665,-0.000761,0.262766,0.111306,0.004914,0.069446


In [174]:
# Average each squared-deviation column over all songs.
n_songs = len(df_out)
for column in ('flair_mse', 'vader_mse', 'textblob_mse'):
    print(df_out[column].sum() / n_songs)

0.04116013385695065
0.0095539242233443
0.01674504611027381


### Discussion:
Flair and Vader have large MSE **because** they have opinions, whereas the MSE from TextBlob looks to be due to a rather subdued decision process: it gives a sentiment close to 0 when really it should be high. For this reason, I'm going to:
* remove TextBlob, 
* keep Flair, but with its score divided by 2 and at a reduced weight of 1/3 relative to Vader,
* and keep Vader as is, weighted 2/3.

In [139]:
df_out.sort_values(['combined'], ascending=True).head(10)

Unnamed: 0,song_index,flair,vader,textblob,combined
42,benee/supalonely,-0.270244,-0.197696,-0.112935,-0.193625
35,juicewrld/stayhigh,-0.352154,-0.122454,-0.075163,-0.183257
22,megantheestallion/savage,-0.152735,-0.259691,-0.104821,-0.172416
30,juicewrld/badenergy,-0.212864,-0.168215,-0.118152,-0.16641
79,nlechoppa/walkemdown,-0.414123,-0.038751,-0.009133,-0.154002
12,dualipa/dontstartnow,-0.308806,-0.088127,-0.003441,-0.133458
66,popsmoke/makeitrain,-0.253585,-0.082403,-0.027511,-0.121166
69,dixiedamelio/behappy,-0.188794,-0.141481,-0.015749,-0.115341
73,kidlaroi/go,-0.252404,-0.072146,-0.014738,-0.113096
20,powfu/deathbed,-0.412763,0.071025,0.014057,-0.109227


In [161]:
# Songs that Flair scores negative while VADER scores positive.
flair_negative = df_out['flair'] < 0
vader_positive = df_out['vader'] > 0
df_out.loc[flair_negative & vader_positive]

Unnamed: 0,song_index,flair,vader,textblob,combined
3,harrystyles/watermelonsugar,-0.254719,0.062711,0.064287,-0.042574
13,dualipa/breakmyheart,-0.016461,0.171824,0.044444,0.066602
15,topic/breakingme,-0.018072,0.059889,0.007032,0.016283
20,powfu/deathbed,-0.412763,0.071025,0.014057,-0.109227
23,postmalone/circles,-0.178851,0.045949,0.001971,-0.043644
49,kanebrown/belikethat,-0.057339,0.094489,-0.019697,0.005818
55,diplo/dancewithme,-0.129197,0.082108,0.016787,-0.010101
64,dontoliver/afterparty,-0.014561,0.360363,0.114078,0.153294


In [162]:
# Songs that Flair scores positive while VADER scores negative.
flair_positive = df_out['flair'] > 0
vader_negative = df_out['vader'] < 0
df_out.loc[flair_positive & vader_negative]

Unnamed: 0,song_index,flair,vader,textblob,combined
0,saintjhn/roses,0.04262,-0.056735,0.001635,-0.00416
4,jackharlow/whatspoppin,0.139775,-0.03298,-0.020325,0.028823
9,juicewrld/hatetheotherside,0.068641,-0.175021,-0.021653,-0.042677
19,popsmoke/forthenight,0.477935,-0.109602,-0.003272,0.121687
25,juicewrld/titanic,0.145246,-0.139022,-0.015903,-0.003227
26,drake/toosieslide,0.160004,-0.023633,-0.021527,0.038281
27,juicewrld/bloodonmyjeans,0.104744,-0.006881,0.006187,0.034683
28,lewiscapaldi/beforeyougo,0.041996,-0.066602,-0.07061,-0.031739
31,popsmoke/gotitonme,0.044951,-0.015293,-0.042583,-0.004308
32,lilmosey/blueberryfaygo,0.162302,-0.247553,-0.026474,-0.037241


In [163]:
# Songs that TextBlob scores positive while VADER scores negative.
textblob_positive = df_out['textblob'] > 0
vader_negative = df_out['vader'] < 0
df_out.loc[textblob_positive & vader_negative]

Unnamed: 0,song_index,flair,vader,textblob,combined
0,saintjhn/roses,0.04262,-0.056735,0.001635,-0.00416
10,juicewrld/conversations,-0.192305,-0.036585,0.004675,-0.074738
27,juicewrld/bloodonmyjeans,0.104744,-0.006881,0.006187,0.034683
34,jpsaxe/iftheworldwasending,0.261627,-0.018404,0.028602,0.090608
37,regard/rideit,0.52323,-0.050662,0.011255,0.161274
41,juicewrld/upupandaway,0.35131,-0.071492,0.003674,0.094498
44,juicewrld/screwjuice,0.230528,-0.105505,0.006542,0.043855
78,scotthelman/waitnomore,-0.148208,-0.03806,0.026997,-0.05309


In [164]:
# Songs that TextBlob scores negative while VADER scores positive.
textblob_negative = df_out['textblob'] < 0
vader_positive = df_out['vader'] > 0
df_out.loc[textblob_negative & vader_positive]

Unnamed: 0,song_index,flair,vader,textblob,combined
16,kidcudi/theadventuresofmoonmanslimshady,0.100686,0.002522,-0.014298,0.029637
21,marshmello/bekind,0.423997,0.022508,-0.015556,0.143649
33,lukebryan/onemargarita,0.31255,0.045896,-0.000583,0.119288
45,juicewrld/manoftheyear,0.477005,0.057243,-0.005455,0.176264
49,kanebrown/belikethat,-0.057339,0.094489,-0.019697,0.005818
51,curtiswaters/stunnin,0.103267,0.0877,-0.005197,0.061923
52,lilbaby/wepaid,0.110093,0.029386,-0.003067,0.045471
57,chrisbrown/gocrazy,0.682925,0.020357,-0.018803,0.22816
60,jonasbrothers/x,0.596392,0.192665,-0.000761,0.262766
82,popsmoke/yeayea,0.39032,0.059595,-0.010271,0.146548


## Test querying the test_data

In [3]:
import sqlite3
# Open the test database and select every song_index. `con` and `cur` stay
# open because the cells below keep using the cursor.
con = sqlite3.connect("test_data/sentiment_compare.db")
cur = con.cursor()
# Plain string literal: the statement has nothing to interpolate.
cur.execute("SELECT song_index FROM sentiment_compare;")


In [17]:
song_index = 'saintjhn/roses'
# Bind the value through a `?` placeholder instead of formatting it into the
# SQL text: a song_index containing a quote can no longer break the statement
# or inject SQL.
cur.execute(
    "SELECT song_index FROM sentiment_compare WHERE song_index = ?;",
    (song_index,),
)

<sqlite3.Cursor at 0x7fc5806d78f0>

In [18]:
# Drain the cursor; any remaining row means the lookup found the song.
matches = cur.fetchall()
if matches:
    print('yes')

yes


In [20]:
bool([])

False