In [1]:
# VADER (Valence Aware Dictionary for Sentiment Reasoning) is a model used for text sentiment analysis that is 
# sensitive to both polarities (positive/negative), the neutral texts and intensity (strength) of the emotion.

# - Vader Model is available in the NLTK package and it can be applied to unlabeled text data. 


# Vader Sentiment Analysis primarily relies on a dictionary which maps lexical features to emotion intensities
# called sentiment scores.

# The sentiment score of a text can be obtained by summing up the intensity of each word in the text.


# Document Sentiment Score: We grab all words in a document. Then, we convert each word to a positive or 
# negative value. Finally, we sum up all values we find. At the end, we will find the document sentiment 
# score.

# For instance, the words such as 'love', 'like', 'enjoy', and 'happy' all convey a positive sentiment.

# VADER is intelligent enough to understand basic contexts of these words, such as 'did not love' as a 
# negative sentiment. It also understands punctuation and capitalization, like "LOVE!!!"

# VADER is also smart enough to understand that the word 'LOVE!!!!!' conveys more positive information than 
# the word 'love'.

# Sentiment Analysis on raw text is always challenging due to a variety of possible factors: 

# 1-) Positive and Negative Sentiment in the same text data
# 2-) Using positive words in a negative way (iğnelemek)

In [2]:
# necessary imports 
import nltk

In [3]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/barissss/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [4]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer


# 'Vader Sentiment Intensity Analyzer' takes a string and returns a dictionary of scores in four categories:
# 1-) Negative score
# 2-) Neutral score 
# 3-) Positive score 
# 4-) Compound score 

# Compound score is computed by normalizing the negative, neutral, and positive scores.

sentiment_intensity_analyzer = SentimentIntensityAnalyzer()
print(sentiment_intensity_analyzer)
print(type(sentiment_intensity_analyzer))

<nltk.sentiment.vader.SentimentIntensityAnalyzer object at 0x7ff1f99cff70>
<class 'nltk.sentiment.vader.SentimentIntensityAnalyzer'>


In [5]:
string = "This is a good movie."
polarity_scores = sentiment_intensity_analyzer.polarity_scores(string)
print(polarity_scores)

{'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'compound': 0.4404}


In [6]:
neg_score = polarity_scores['neg']
pos_score = polarity_scores['pos']
neu_score = polarity_scores['neu']
comp_score = polarity_scores['compound']

print("The sentiment scores for the text named 'string' is as below: ")
print()
print("The negative score: "+str(neg_score)+"")
print("The positive score: "+str(pos_score)+"")
print("The neutral score:  "+str(neu_score)+"")
print("The compound score: "+str(comp_score)+"")

The sentiment scores for the text named 'string' is as below: 

The negative score: 0.0
The positive score: 0.492
The neutral score:  0.508
The compound score: 0.4404


In [7]:
my_string = "This is the WORST movie that has ever disgraced the screen."
polarity_scores2 = sentiment_intensity_analyzer.polarity_scores(my_string)
print(polarity_scores2)

{'neg': 0.465, 'neu': 0.535, 'pos': 0.0, 'compound': -0.8331}


In [8]:
negative_score = polarity_scores['neg']
positive_score = polarity_scores['pos']
neutral_score = polarity_scores['neu']
compound_score = polarity_scores['compound']

print("The sentiment scores for the text named 'my_string' is as below: ")
print()
print("The negative score: "+str(neg_score)+"")
print("The positive score: "+str(pos_score)+"")
print("The neutral score: "+str(neu_score)+"")
print("The compound score: "+str(comp_score)+"")

# When the compound score is 0, then it means that the text is neutral. When the compound score is smaller than 
# zero, then it means that the text is negative. When the compound score is greater than zero, then it means that 
# the text is positive. 

The sentiment scores for the text named 'my_string' is as below: 

The negative score: 0.0
The positive score: 0.492
The neutral score: 0.508
The compound score: 0.4404


In [9]:
import pandas as pd

# reading the tsv file called 'amazonreviews.tsv'
amazon_reviews_df = pd.read_csv('amazonreviews.tsv', sep='\t')

print("The Amazon reviews data frame is as below:")
print()
print(amazon_reviews_df)

The Amazon reviews data frame is as below:

     label                                             review
0      pos  Stuning even for the non-gamer: This sound tra...
1      pos  The best soundtrack ever to anything.: I'm rea...
2      pos  Amazing!: This soundtrack is my favorite music...
3      pos  Excellent Soundtrack: I truly like this soundt...
4      pos  Remember, Pull Your Jaw Off The Floor After He...
...    ...                                                ...
9995   pos  A revelation of life in small town America in ...
9996   pos  Great biography of a very interesting journali...
9997   neg  Interesting Subject; Poor Presentation: You'd ...
9998   neg  Don't buy: The box looked used and it is obvio...
9999   pos  Beautiful Pen and Fast Delivery.: The pen was ...

[10000 rows x 2 columns]


In [10]:
# Displaying the head, in other words the first 5 records, of the Amazon reviews data frame
amazon_reviews_df.head()

Unnamed: 0,label,review
0,pos,Stuning even for the non-gamer: This sound tra...
1,pos,The best soundtrack ever to anything.: I'm rea...
2,pos,Amazing!: This soundtrack is my favorite music...
3,pos,Excellent Soundtrack: I truly like this soundt...
4,pos,"Remember, Pull Your Jaw Off The Floor After He..."


In [11]:
print("The first 5 records (head) of the Amazon reviews data frame: ")
print()
print(amazon_reviews_df.head())

The first 5 records (head) of the Amazon reviews data frame: 

  label                                             review
0   pos  Stuning even for the non-gamer: This sound tra...
1   pos  The best soundtrack ever to anything.: I'm rea...
2   pos  Amazing!: This soundtrack is my favorite music...
3   pos  Excellent Soundtrack: I truly like this soundt...
4   pos  Remember, Pull Your Jaw Off The Floor After He...


In [12]:
# Displaying the tail, in other words the last 5 records, of the Amazon reviews data frame
amazon_reviews_df.tail()

Unnamed: 0,label,review
9995,pos,A revelation of life in small town America in ...
9996,pos,Great biography of a very interesting journali...
9997,neg,Interesting Subject; Poor Presentation: You'd ...
9998,neg,Don't buy: The box looked used and it is obvio...
9999,pos,Beautiful Pen and Fast Delivery.: The pen was ...


In [13]:
print("The last 5 records (tail) of the Amazon reviews data frame: ")
print()
print(amazon_reviews_df.tail())

The last 5 records (tail) of the Amazon reviews data frame: 

     label                                             review
9995   pos  A revelation of life in small town America in ...
9996   pos  Great biography of a very interesting journali...
9997   neg  Interesting Subject; Poor Presentation: You'd ...
9998   neg  Don't buy: The box looked used and it is obvio...
9999   pos  Beautiful Pen and Fast Delivery.: The pen was ...


In [14]:
row_num, col_num = amazon_reviews_df.shape
print("There are "+str(row_num)+" rows and "+str(col_num)+" columns in the Amazon reviews data frame.")
print()
print("The total number of rows: "+str(row_num)+"")
print("The total number of columns: "+str(col_num)+"")

There are 10000 rows and 2 columns in the Amazon reviews data frame.

The total number of rows: 10000
The total number of columns: 2


In [15]:
print("The labels in the Amazon reviews data frame: ")
print()
print(amazon_reviews_df['label'])
print()
print()
print(amazon_reviews_df['label'].value_counts()) # this gets the number of 'neg' and 'pos' labels, meaning the 
# number of negative movies and the number of positive movies.

The labels in the Amazon reviews data frame: 

0       pos
1       pos
2       pos
3       pos
4       pos
       ... 
9995    pos
9996    pos
9997    neg
9998    neg
9999    pos
Name: label, Length: 10000, dtype: object


neg    5097
pos    4903
Name: label, dtype: int64


In [16]:
label_count = amazon_reviews_df['label'].value_counts()
negative_count = label_count['neg']
positive_count = label_count['pos']

negative_review_rate = negative_count / (negative_count + positive_count)
positive_review_rate = positive_count / (negative_count + positive_count)

print("The negative review rate in Amazon's review data frame: "+str(negative_review_rate)+"")
print("The positive review rate in Amazon's review data frame: "+str(positive_review_rate)+"")

The negative review rate in Amazon's review data frame: 0.5097
The positive review rate in Amazon's review data frame: 0.4903


In [17]:
print("The reviews in the Amazon reviews data frame: ")
print()
print(amazon_reviews_df['review'])

The reviews in the Amazon reviews data frame: 

0       Stuning even for the non-gamer: This sound tra...
1       The best soundtrack ever to anything.: I'm rea...
2       Amazing!: This soundtrack is my favorite music...
3       Excellent Soundtrack: I truly like this soundt...
4       Remember, Pull Your Jaw Off The Floor After He...
                              ...                        
9995    A revelation of life in small town America in ...
9996    Great biography of a very interesting journali...
9997    Interesting Subject; Poor Presentation: You'd ...
9998    Don't buy: The box looked used and it is obvio...
9999    Beautiful Pen and Fast Delivery.: The pen was ...
Name: review, Length: 10000, dtype: object


In [23]:
# checks the null values
amazon_reviews_df.isnull().sum() # to check whether a specific column of the data frame contains missing values.

label     0
review    0
dtype: int64

In [26]:
# If there were any nan values, this will be used to drop them from the 
# amazon reviews data frame.
amazon_reviews_df.dropna(inplace=True) 

In [27]:
blanks = list()
for index, label, review in amazon_reviews_df.itertuples():
    if type(review) == str:
        if review.isspace():
            blanks.append(index)

print(blanks)

[]


In [34]:
amazon_reviews_df.iloc[0]['review'] # this grabs the text of the first review

'Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^'

In [35]:
print(amazon_reviews_df.iloc[0]['review'])

Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^


In [33]:
print(amazon_reviews_df.iloc[0])
print()
print()
print(amazon_reviews_df.iloc[0]['review'])
print()
print()
print(sentiment_intensity_analyzer.polarity_scores(amazon_reviews_df.iloc[0]['review']))

label                                                   pos
review    Stuning even for the non-gamer: This sound tra...
Name: 0, dtype: object


Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^


{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'compound': 0.9454}


In [36]:
# This adds a 'scores' column, containing the sentiment scores of each review, to the amazon reviews data frame.
amazon_reviews_df['scores'] = amazon_reviews_df['review'].apply(lambda review: sentiment_intensity_analyzer.polarity_scores(review))

In [37]:
print(amazon_reviews_df.head())

  label                                             review  \
0   pos  Stuning even for the non-gamer: This sound tra...   
1   pos  The best soundtrack ever to anything.: I'm rea...   
2   pos  Amazing!: This soundtrack is my favorite music...   
3   pos  Excellent Soundtrack: I truly like this soundt...   
4   pos  Remember, Pull Your Jaw Off The Floor After He...   

                                              scores  
0  {'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...  
1  {'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...  
2  {'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...  
3  {'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...  
4  {'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...  


In [38]:
print(amazon_reviews_df.tail())

     label                                             review  \
9995   pos  A revelation of life in small town America in ...   
9996   pos  Great biography of a very interesting journali...   
9997   neg  Interesting Subject; Poor Presentation: You'd ...   
9998   neg  Don't buy: The box looked used and it is obvio...   
9999   pos  Beautiful Pen and Fast Delivery.: The pen was ...   

                                                 scores  
9995  {'neg': 0.017, 'neu': 0.846, 'pos': 0.136, 'co...  
9996  {'neg': 0.0, 'neu': 0.868, 'pos': 0.132, 'comp...  
9997  {'neg': 0.084, 'neu': 0.754, 'pos': 0.162, 'co...  
9998  {'neg': 0.091, 'neu': 0.909, 'pos': 0.0, 'comp...  
9999  {'neg': 0.028, 'neu': 0.811, 'pos': 0.161, 'co...  


In [39]:
amazon_reviews_df['compound'] = amazon_reviews_df['scores'].apply(lambda d: d['compound'])

In [40]:
print(amazon_reviews_df.head())

  label                                             review  \
0   pos  Stuning even for the non-gamer: This sound tra...   
1   pos  The best soundtrack ever to anything.: I'm rea...   
2   pos  Amazing!: This soundtrack is my favorite music...   
3   pos  Excellent Soundtrack: I truly like this soundt...   
4   pos  Remember, Pull Your Jaw Off The Floor After He...   

                                              scores  compound  
0  {'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...    0.9454  
1  {'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...    0.8957  
2  {'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...    0.9858  
3  {'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...    0.9814  
4  {'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...    0.9781  


In [42]:
amazon_reviews_df['negative'] = amazon_reviews_df['scores'].apply(lambda d: d['neg'])

In [43]:
amazon_reviews_df['neutral'] = amazon_reviews_df['scores'].apply(lambda d: d['neu'])
amazon_reviews_df['positive'] = amazon_reviews_df['scores'].apply(lambda d: d['pos'])

In [44]:
print(amazon_reviews_df.head())

  label                                             review  \
0   pos  Stuning even for the non-gamer: This sound tra...   
1   pos  The best soundtrack ever to anything.: I'm rea...   
2   pos  Amazing!: This soundtrack is my favorite music...   
3   pos  Excellent Soundtrack: I truly like this soundt...   
4   pos  Remember, Pull Your Jaw Off The Floor After He...   

                                              scores  compound  negative  \
0  {'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...    0.9454     0.088   
1  {'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...    0.8957     0.018   
2  {'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...    0.9858     0.040   
3  {'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...    0.9814     0.090   
4  {'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...    0.9781     0.000   

   neutral  positive  
0    0.669     0.243  
1    0.837     0.145  
2    0.692     0.268  
3    0.615     0.295  
4    0.746     0.254  


In [49]:
# This applies the below prediction on the reviews:
# It says that a review is negative if the compound score of this review is smaller than 0. 
# If the compound score of the review is greater than or equal to 0, the review is positive.
amazon_reviews_df['sentiment_result'] = amazon_reviews_df['compound'].apply(lambda score: "neg" if score < 0 else "pos")

In [50]:
print(amazon_reviews_df.head())

  label                                             review  \
0   pos  Stuning even for the non-gamer: This sound tra...   
1   pos  The best soundtrack ever to anything.: I'm rea...   
2   pos  Amazing!: This soundtrack is my favorite music...   
3   pos  Excellent Soundtrack: I truly like this soundt...   
4   pos  Remember, Pull Your Jaw Off The Floor After He...   

                                              scores  compound  negative  \
0  {'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...    0.9454     0.088   
1  {'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...    0.8957     0.018   
2  {'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...    0.9858     0.040   
3  {'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...    0.9814     0.090   
4  {'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...    0.9781     0.000   

   neutral  positive sent_result sentiment_result  
0    0.669     0.243         pos              pos  
1    0.837     0.145         pos              pos  
2    0.692    

In [51]:
amazon_reviews_df.drop('sent_result', axis=1, inplace=True)

In [52]:
print(amazon_reviews_df.head())

  label                                             review  \
0   pos  Stuning even for the non-gamer: This sound tra...   
1   pos  The best soundtrack ever to anything.: I'm rea...   
2   pos  Amazing!: This soundtrack is my favorite music...   
3   pos  Excellent Soundtrack: I truly like this soundt...   
4   pos  Remember, Pull Your Jaw Off The Floor After He...   

                                              scores  compound  negative  \
0  {'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...    0.9454     0.088   
1  {'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...    0.8957     0.018   
2  {'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...    0.9858     0.040   
3  {'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...    0.9814     0.090   
4  {'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...    0.9781     0.000   

   neutral  positive sentiment_result  
0    0.669     0.243              pos  
1    0.837     0.145              pos  
2    0.692     0.268              pos  
3    0.615

In [58]:
# Performance evaluation for the custom prediction made on reviews

# necessary imports
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [59]:
# accuracy score calculation
accuracy = accuracy_score(amazon_reviews_df['label'], amazon_reviews_df['sentiment_result'])
print(accuracy)

0.7097


In [60]:
# classification report 
classification_report = classification_report(amazon_reviews_df['label'], amazon_reviews_df['sentiment_result'])
print(classification_report)

              precision    recall  f1-score   support

         neg       0.86      0.52      0.64      5097
         pos       0.64      0.91      0.75      4903

    accuracy                           0.71     10000
   macro avg       0.75      0.71      0.70     10000
weighted avg       0.75      0.71      0.70     10000



In [61]:
# confusion matrix
confusion_matrix = confusion_matrix(amazon_reviews_df['label'], amazon_reviews_df['sentiment_result'])
print(confusion_matrix)

[[2629 2468]
 [ 435 4468]]
