# Part 4: Sentiment Analysis

### Create Sentiment Analysis Scores for each Individual Review

For each review, compute the TextBlob polarity and subjectivity scores and the VADER compound score.

In [2]:
!pip install textblob

Collecting textblob
  Downloading textblob-0.19.0-py3-none-any.whl.metadata (4.4 kB)
Downloading textblob-0.19.0-py3-none-any.whl (624 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m624.3/624.3 kB[0m [31m28.0 MB/s[0m  [33m0:00:00[0m
[?25hInstalling collected packages: textblob
Successfully installed textblob-0.19.0


In [5]:
# Imports and Helper functions

from datetime import datetime
import time
import requests
import pickle
from pathlib import Path
import re
import random

import pandas as pd

In [3]:
# imports for SA
from textblob import TextBlob
from textblob import Word

## Prep the Data

In [49]:
## load the cleaned reviews, with recommendationid converted to a string
review_df = (
    pd.read_csv('data/review_tokenized.csv')
    .assign(recommendationid=lambda frame: frame['recommendationid'].astype("str"))
)
print(review_df.shape)

(48662, 3)


In [50]:
## load the raw reviews, keeping only the id and the original review text
raw_columns = ['recommendationid', 'review_text']
raw_review_df = pd.read_csv('data/raw_reviews.csv')[raw_columns]
raw_review_df = raw_review_df.assign(
    recommendationid=raw_review_df['recommendationid'].astype("str")
)
raw_review_df.head()

Unnamed: 0,recommendationid,review_text
0,212664845,Addictive. Stressful. Time waster.
1,212664820,fuak arc\r\n
2,212664759,W
3,212664729,awesome game!
4,212664705,If the Steam comments section is like every ot...


In [51]:
## Sentiment analysis should run on the raw (un-lemmatized) text, which we tokenize below.
## Some reviews were filtered out of the cleaned set because of their text length, so
## inner-join the cleaned reviews to the raw reviews: only reviews that are part of the
## corpus get sentiment scores.

review_token_df = review_df.merge(raw_review_df, how="inner", on="recommendationid")
review_token_df.head()

Unnamed: 0,recommendationid,Appname,review_cleaned,review_text
0,212664845,ARC Raiders,addictive stressful time waster,Addictive. Stressful. Time waster.
1,212664705,ARC Raiders,steam comment section like every comment secti...,If the Steam comments section is like every ot...
2,212664692,ARC Raiders,like gathering sneak around arc pvp part peopl...,I like the gathering and sneaking around the A...
3,212664560,ARC Raiders,well make game every time hop experience somet...,"Very well made game, every time I hop on I exp..."
4,212664471,ARC Raiders,think would sweaty honestly somehow stop playi...,I thought this would be too sweaty for me. Hon...


In [52]:
from nltk.tokenize import wordpunct_tokenize
from nltk.tokenize import word_tokenize

In [53]:
# split the raw text of every review into a list of tokens
review_token_df['review_tokens'] = review_token_df['review_text'].map(wordpunct_tokenize)
review_token_df['review_tokens']

0            [Addictive, ., Stressful, ., Time, waster, .]
1        [If, the, Steam, comments, section, is, like, ...
2        [I, like, the, gathering, and, sneaking, aroun...
3        [Very, well, made, game, ,, every, time, I, ho...
4        [I, thought, this, would, be, too, sweaty, for...
                               ...                        
48657                       [We, have, Hades, at, home, :]
48658    [Unfortunately, can, not, recommend, the, game...
48659    [if, you, enjoy, soulstone, survivors, or, had...
48660    [Great, game, overall, !, I, really, enjoy, th...
48661    [The, game, is, pretty, good, for, some, of, t...
Name: review_tokens, Length: 48662, dtype: object

In [56]:
# rebuild each review as a single string, with its tokens separated by one space
review_token_df['review_string'] = review_token_df['review_tokens'].map(
    lambda tokens: ' '.join(map(str, tokens))
)
# check the final text
review_token_df.head()

Unnamed: 0,recommendationid,Appname,review_cleaned,review_text,review_tokens,review_string
0,212664845,ARC Raiders,addictive stressful time waster,Addictive. Stressful. Time waster.,"[Addictive, ., Stressful, ., Time, waster, .]",Addictive . Stressful . Time waster .
1,212664705,ARC Raiders,steam comment section like every comment secti...,If the Steam comments section is like every ot...,"[If, the, Steam, comments, section, is, like, ...",If the Steam comments section is like every ot...
2,212664692,ARC Raiders,like gathering sneak around arc pvp part peopl...,I like the gathering and sneaking around the A...,"[I, like, the, gathering, and, sneaking, aroun...",I like the gathering and sneaking around the A...
3,212664560,ARC Raiders,well make game every time hop experience somet...,"Very well made game, every time I hop on I exp...","[Very, well, made, game, ,, every, time, I, ho...","Very well made game , every time I hop on I ex..."
4,212664471,ARC Raiders,think would sweaty honestly somehow stop playi...,I thought this would be too sweaty for me. Hon...,"[I, thought, this, would, be, too, sweaty, for...",I thought this would be too sweaty for me . Ho...


### Add the polarity and subjectivity measures from TextBlob

In [57]:
# Score every review with TextBlob exactly once and reuse the result for both columns;
# building a separate TextBlob per column would repeat the same analysis for each review.
sentiments = [TextBlob(text).sentiment for text in review_token_df['review_string']]

# polarity is the first field of TextBlob's sentiment result
review_token_df['polarity'] = [sentiment.polarity for sentiment in sentiments]

# subjectivity is the second field of TextBlob's sentiment result
review_token_df['subjectivity'] = [sentiment.subjectivity for sentiment in sentiments]

review_token_df.head()

## I can see small differences in polarity and subjectivity, but overall tokenizing isn't drastically changing sentiment analysis results

Unnamed: 0,recommendationid,Appname,review_cleaned,review_text,review_tokens,review_string,polarity,subjectivity
0,212664845,ARC Raiders,addictive stressful time waster,Addictive. Stressful. Time waster.,"[Addictive, ., Stressful, ., Time, waster, .]",Addictive . Stressful . Time waster .,0.0,0.9
1,212664705,ARC Raiders,steam comment section like every comment secti...,If the Steam comments section is like every ot...,"[If, the, Steam, comments, section, is, like, ...",If the Steam comments section is like every ot...,0.030422,0.493232
2,212664692,ARC Raiders,like gathering sneak around arc pvp part peopl...,I like the gathering and sneaking around the A...,"[I, like, the, gathering, and, sneaking, aroun...",I like the gathering and sneaking around the A...,-0.275,0.25
3,212664560,ARC Raiders,well make game every time hop experience somet...,"Very well made game, every time I hop on I exp...","[Very, well, made, game, ,, every, time, I, ho...","Very well made game , every time I hop on I ex...",-0.021212,0.384848
4,212664471,ARC Raiders,think would sweaty honestly somehow stop playi...,I thought this would be too sweaty for me. Hon...,"[I, thought, this, would, be, too, sweaty, for...",I thought this would be too sweaty for me . Ho...,0.252083,0.510417


### Add the compound score from NLTK

In [58]:
## set up sentiment analysis with NLTK
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# fetch the VADER lexicon first, then create the analyzer used for scoring
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/joh11678/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [59]:
# NLTK compound score for each tokenized review string
review_token_df['NLTK_Compound'] = review_token_df['review_string'].map(
    lambda text: sia.polarity_scores(text)['compound']
)

review_token_df.head()

## I tried with both the raw text and the tokenized text. I'm going to use the tokenized results because I know they aren't getting tripped up by the \t characters

Unnamed: 0,recommendationid,Appname,review_cleaned,review_text,review_tokens,review_string,polarity,subjectivity,NLTK_Compound
0,212664845,ARC Raiders,addictive stressful time waster,Addictive. Stressful. Time waster.,"[Addictive, ., Stressful, ., Time, waster, .]",Addictive . Stressful . Time waster .,0.0,0.9,-0.5106
1,212664705,ARC Raiders,steam comment section like every comment secti...,If the Steam comments section is like every ot...,"[If, the, Steam, comments, section, is, like, ...",If the Steam comments section is like every ot...,0.030422,0.493232,-0.8888
2,212664692,ARC Raiders,like gathering sneak around arc pvp part peopl...,I like the gathering and sneaking around the A...,"[I, like, the, gathering, and, sneaking, aroun...",I like the gathering and sneaking around the A...,-0.275,0.25,-0.2498
3,212664560,ARC Raiders,well make game every time hop experience somet...,"Very well made game, every time I hop on I exp...","[Very, well, made, game, ,, every, time, I, ho...","Very well made game , every time I hop on I ex...",-0.021212,0.384848,0.3384
4,212664471,ARC Raiders,think would sweaty honestly somehow stop playi...,I thought this would be too sweaty for me. Hon...,"[I, thought, this, would, be, too, sweaty, for...",I thought this would be too sweaty for me . Ho...,0.252083,0.510417,0.4404


### Save output and display

In [60]:
## write sa results: the review id, the app name and the three sentiment scores
sa_columns = ['recommendationid', 'Appname', 'polarity', 'subjectivity', 'NLTK_Compound']
review_token_df[sa_columns].to_csv('data/review_sa.csv', index=False)

In [61]:
## load sa results
## Read recommendationid as a string so it matches the string ids used when the review
## frames were loaded earlier in this notebook; without an explicit dtype, read_csv
## infers the column type from the values.
review_token_df = pd.read_csv('data/review_sa.csv', dtype={'recommendationid': 'str'})
review_token_df.head()

Unnamed: 0,recommendationid,Appname,polarity,subjectivity,NLTK_Compound
0,212664845,ARC Raiders,0.0,0.9,-0.5106
1,212664705,ARC Raiders,0.030422,0.493232,-0.8888
2,212664692,ARC Raiders,-0.275,0.25,-0.2498
3,212664560,ARC Raiders,-0.021212,0.384848,0.3384
4,212664471,ARC Raiders,0.252083,0.510417,0.4404
