In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("spark-nlp") \
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.4.5") \
    .getOrCreate()

In [2]:
sc = spark.sparkContext 
sc

In [20]:
### Ignore all the warnings of following cells
import warnings
warnings.filterwarnings('ignore')

In [3]:
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

In [4]:
data = spark.read.json("s3://502-project/amazon_game_data")

In [5]:
### Drop unnecessary columns in Amazon game datasets
drop_list = ['asin', 'helpful', 'reviewTime', 'reviewerID', 'reviewerName', 'summary', 'unixReviewTime']
data = data.select([column for column in data.columns if column not in drop_list])

# format text column
data.createOrReplaceTempView("data")
data=spark.sql("SELECT overall, LOWER(reviewText) AS reviewText FROM data")
data.show(5)
data.printSchema()

+-------+--------------------+
|overall|          reviewText|
+-------+--------------------+
|    5.0|this is a old cla...|
|    4.0|this game is more...|
|    5.0|if you love wwf n...|
|    4.0|i had wwf wrestle...|
|    4.0|i have to admit i...|
+-------+--------------------+
only showing top 5 rows

root
 |-- overall: double (nullable = true)
 |-- reviewText: string (nullable = true)



In [6]:
import nltk
nltk.download('stopwords')
nltk.download('words')
nltk.download('punkt')
import nltk
from nltk.corpus import stopwords
stopwords_lst=stopwords.words('english')+['1','2','3','4','5','6','7','8','9','0',]

[nltk_data] Downloading package stopwords to /home/hadoop/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /home/hadoop/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to /home/hadoop/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
data.createOrReplaceTempView('data')

In [8]:
from pyspark.sql import functions as F
df=data.withColumn('attitude', F.when(F.col('overall')<3,'negative').otherwise(F.when( F.col('overall') == 3,'neutral').otherwise('positive')))

In [9]:
### Convert to pandas for convenient
dff = df.toPandas()



pandas.core.frame.DataFrame

In [10]:
from nltk.sentiment import SentimentAnalyzer, SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from collections import Counter

In [12]:
### current dataframe shape
dff.shape

(1075312, 3)

In [14]:
### install vaderSentiment package
pip install vaderSentiment

Note: you may need to restart the kernel to use updated packages.


In [15]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import numpy as np

In [16]:
### create new column and wait for future fill in score
dff['score'] = np.nan
dff.head(5)

Unnamed: 0,overall,reviewText,attitude,score
0,5.0,this is a old classic wrestling game. you can...,positive,
1,4.0,this game is more of a one-on-one fighter (a' ...,positive,
2,5.0,if you love wwf now its called wwe this is a g...,positive,
3,4.0,i had wwf wrestlemania: the arcde game years a...,positive,
4,4.0,i have to admit i hadn't started watching wres...,positive,


In [17]:
### create new column and wait for future fill in vader result
dff['vaderResult'] = np.nan
dff.head(5)

Unnamed: 0,overall,reviewText,attitude,score,vaderResult
0,5.0,this is a old classic wrestling game. you can...,positive,,
1,4.0,this game is more of a one-on-one fighter (a' ...,positive,,
2,5.0,if you love wwf now its called wwe this is a g...,positive,,
3,4.0,i had wwf wrestlemania: the arcde game years a...,positive,,
4,4.0,i have to admit i hadn't started watching wres...,positive,,


In [18]:
### apply vader package
vader = SentimentIntensityAnalyzer()

### choose certain threshold to divide score into three kinds of vader result
for i in range(len(dff['reviewText'])):
    vv = vader.polarity_scores(dff['reviewText'][i])['compound']
    dff['score'][i] = vv
    if i % 10000 == 0:
        print(i)
    if vv >= 0.5:
        dff['vaderResult'][i] = 'positive'
    elif vv <= -0.5:
        dff['vaderResult'][i] = 'negative'
    else:
        dff['vaderResult'][i] = 'neutral'
    
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [24]:
### Pick certain threshold on VADER score to assign vaderresult

for i in range(len(dff['score'])):
    if dff['score'][i] >= 0.5:
        dff['vaderResult'][i] = 'positive'
    elif dff['score'][i] <= -0.5:
        dff['vaderResult'][i] = 'negative'
    else:
        dff['vaderResult'][i] = 'neutral'

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000
390000
400000
410000
420000
430000
440000
450000
460000
470000
480000
490000
500000
510000
520000
530000
540000
550000
560000
570000
580000
590000
600000
610000
620000
630000
640000
650000
660000
670000
680000
690000
700000
710000
720000
730000
740000
750000
760000
770000
780000
790000
800000
810000
820000
830000
840000
850000
860000
870000
880000
890000
900000
910000
920000
930000
940000
950000
960000
970000
980000
990000
1000000
1010000
1020000
1030000
1040000
1050000
1060000
1070000


In [25]:
dff.head(100)

Unnamed: 0,overall,reviewText,attitude,score,vaderResult
0,5.0,this is a old classic wrestling game. you can...,positive,0.0000,neutral
1,4.0,this game is more of a one-on-one fighter (a' ...,positive,0.2942,neutral
2,5.0,if you love wwf now its called wwe this is a g...,positive,0.8884,positive
3,4.0,i had wwf wrestlemania: the arcde game years a...,positive,0.9896,positive
4,4.0,i have to admit i hadn't started watching wres...,positive,0.8790,positive
...,...,...,...,...,...
95,5.0,"everquest is by far the most fun, most origina...",positive,0.9074,positive
96,5.0,this game is huge! there is so much to do and...,positive,0.8891,positive
97,5.0,"everquest is the best game there is, no questi...",positive,0.9943,positive
98,5.0,this game is almost god-like. if your are a r...,positive,-0.7410,negative


In [32]:
### figure out the wrong review 
corcount = 0
total = len(dff['score'])
for i in range(len(dff['score'])):
    if dff['vaderResult'][i] == dff['attitude'][i]:
        corcount += 1

In [34]:
### calculate the accuracy
accuracy = corcount / total