In [20]:
"""

VADER ( Valence Aware Dictionary for Sentiment Reasoning)
is a model used for text sentiment analysis that is sensitive to
both polarity (positive/ negative) and intensity(strength) of emotion

Primarily , VADER sentiment analysis relies on a dictionary which maps
lexical features to emotion insensities called sentiment scores.

The sentiment score of a text can be obtained by summing up the intensity
of each word in the text.

For example, words like "love" , "like" , "enjoy" , "happy" all convey
a positive sentiment.

VADER is intelligent enough to understand basic context of these words,
such as "did not love" as a negative sentiment.

It also understand capitalization and punctuation such as "LOVE!!!"

Sentiment Analysis on raw text is always challenging however, due to
a variety of possible factors:
Positive and Negative sentiment in the same text data.

Sarcasm using positive words in a negative way.

"""



'\n\nVADER ( Valence Aware Dictionary for Sentiment Reasoning)\nis a model used for text sentiment analysis that is sensitive to \nboth polarity (positive/ negative) and intensity(strength) of emotion\n\nPrimarily , VADER sentiment analysis relies on a dictionary which maps\nlexical features to emotion insensities called sentiment scores.\n\nThe sentiment score of a text can be obtained by summing up the intensity\nof each word in the text.\n\nFor example, words like "love" , "like" , "enjoy" , "happy" all convey\na positive sentiment.\n\nVADER is intelligent enough to understand basic context of these words,\nsuch as "did not love" as a negative sentiment.\n\nIt also understand capitalization and punctuation such as "LOVE!!!"\n\nSentiment Analysis on raw text is always challenging however, due to\na variety of possible factors:\nPositive and Negative sentiment in the same text data.\n\nSarcasm using positive words in a negative way.\n\n'

In [21]:
import nltk

In [22]:
nltk.download("vader_lexicon")

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [23]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [24]:

sid = SentimentIntensityAnalyzer()
sid


<nltk.sentiment.vader.SentimentIntensityAnalyzer at 0x7cc4a9bb8970>

In [25]:
# Vader Sentiment Intensity Analyzer does is it simply takes in the
# string and returns a dictionary of scores in 4 categories.

# Negative, neutral, positive, and then a compound score which is
# computed by normalizing the negative, neutral and positive scores.



In [26]:
a = "This is a good movie"

sid.polarity_scores(a)



{'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'compound': 0.4404}

In [27]:
b="I can't find my keys anywhere."

sid.polarity_scores(b)


{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

In [28]:
c = "I love the new design of the website, but it's really slow to load."

sid.polarity_scores(c)


{'neg': 0.0, 'neu': 0.822, 'pos': 0.178, 'compound': 0.3818}

In [29]:
d = "I enjoy going for morning runs, but the weather has been too cold lately."

sid.polarity_scores(d)


{'neg': 0.0, 'neu': 0.851, 'pos': 0.149, 'compound': 0.2732}

In [30]:
# Compound score above 0 indicates positive score,
# Compound score below 0 indicates negative score.



In [31]:
import pandas as pd


In [32]:
data =  pd.read_csv("/content/amazonreviews.tsv",sep="\t")
data.head()

Unnamed: 0,label,review
0,pos,Stuning even for the non-gamer: This sound tra...
1,pos,The best soundtrack ever to anything.: I'm rea...
2,pos,Amazing!: This soundtrack is my favorite music...
3,pos,Excellent Soundtrack: I truly like this soundt...
4,pos,"Remember, Pull Your Jaw Off The Floor After He..."


In [33]:
data['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
neg,5097
pos,4903


In [34]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   10000 non-null  object
 1   review  10000 non-null  object
dtypes: object(2)
memory usage: 156.4+ KB


In [35]:
data.isna().sum()

Unnamed: 0,0
label,0
review,0


In [37]:
data.dropna(inplace=True)

In [38]:
# drop empty white space values if any

blanks=[]
for i,lb,rv in data.itertuples():

    # index, label, review
    if type(rv) == str:
        if rv.isspace():
            blanks.append(i)
blanks


[]

In [39]:
data.drop(blanks,inplace=True)

In [44]:
data.iloc[0]['review']

#data['review'][0]

'Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^'

In [41]:
sid.polarity_scores(data.iloc[0]['review'])

{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'compound': 0.9454}

In [45]:

# add new column
data['scores']= data['review'].apply(lambda review: sid.polarity_scores(review))
data.head()


Unnamed: 0,label,review,scores
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co..."
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co..."
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com..."
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com..."
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp..."


In [47]:
# new column compound
data['compound']= data['scores'].apply(lambda d:d['compound'])
data.head()

Unnamed: 0,label,review,scores,compound
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...",0.9454
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...",0.8957
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...",0.9858
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...",0.9814
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...",0.9781


In [50]:

data['comp_score']= data['compound'].apply(lambda score:"pos" if score>=0 else "neg")

data.head()

Unnamed: 0,label,review,scores,compound,comp_score
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...",0.9454,pos
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...",0.8957,pos
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...",0.9858,pos
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...",0.9814,pos
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...",0.9781,pos


In [51]:
data.tail()

Unnamed: 0,label,review,scores,compound,comp_score
9995,pos,A revelation of life in small town America in ...,"{'neg': 0.017, 'neu': 0.846, 'pos': 0.136, 'co...",0.961,pos
9996,pos,Great biography of a very interesting journali...,"{'neg': 0.0, 'neu': 0.868, 'pos': 0.132, 'comp...",0.9544,pos
9997,neg,Interesting Subject; Poor Presentation: You'd ...,"{'neg': 0.084, 'neu': 0.754, 'pos': 0.162, 'co...",0.9102,pos
9998,neg,Don't buy: The box looked used and it is obvio...,"{'neg': 0.091, 'neu': 0.909, 'pos': 0.0, 'comp...",-0.3595,neg
9999,pos,Beautiful Pen and Fast Delivery.: The pen was ...,"{'neg': 0.028, 'neu': 0.811, 'pos': 0.161, 'co...",0.9107,pos


In [52]:
from sklearn.metrics import confusion_matrix,classification_report


In [54]:
confusion_matrix(data['label'],data['comp_score'])

array([[2629, 2468],
       [ 435, 4468]])

In [58]:
report=classification_report(data['label'],data['comp_score'],output_dict=True)
report

{'neg': {'precision': 0.8580287206266318,
  'recall': 0.5157936040808319,
  'f1-score': 0.6442837887513785,
  'support': 5097.0},
 'pos': {'precision': 0.6441753171856978,
  'recall': 0.9112788088925148,
  'f1-score': 0.7547934791789848,
  'support': 4903.0},
 'accuracy': 0.7097,
 'macro avg': {'precision': 0.7511020189061648,
  'recall': 0.7135362064866733,
  'f1-score': 0.6995386339651817,
  'support': 10000.0},
 'weighted avg': {'precision': 0.7531763969195419,
  'recall': 0.7097,
  'f1-score': 0.6984666899680338,
  'support': 10000.0}}

In [60]:
report= pd.DataFrame(report).transpose()
report

Unnamed: 0,neg,pos,accuracy,macro avg,weighted avg
precision,0.858029,0.644175,0.7097,0.751102,0.753176
recall,0.515794,0.911279,0.7097,0.713536,0.7097
f1-score,0.644284,0.754793,0.7097,0.699539,0.698467
support,5097.0,4903.0,0.7097,10000.0,10000.0
