In [1]:
import findspark
findspark.init()
from pyspark import SparkContext
from pyspark.sql import SparkSession
spark = SparkSession.builder \
     .appName("SparkSession") \
     .getOrCreate()

sc = spark.sparkContext 
sc

In [2]:
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)

In [3]:
data = spark.read.json("s3://502-project/amazon_game_data")
data.show(10)

+----------+-------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+
|      asin|helpful|overall|          reviewText| reviewTime|    reviewerID|        reviewerName|             summary|unixReviewTime|
+----------+-------+-------+--------------------+-----------+--------------+--------------------+--------------------+--------------+
|B00002STAU| [0, 0]|    5.0|this is a old cla...|07 30, 2012|A1G0VFQ9198IUF|                  al|           a classic|    1343606400|
|B00002STAU| [2, 2]|    4.0|This game is more...|02 21, 2001| AXUOVXIGF9CKC|      "bigdcaldavis"|  good fighting game|     982713600|
|B00002STAU| [0, 0]|    5.0|If you love WWF n...|11 14, 2011|A15JTJXQXO22JJ|           Chad Frey|WWF Wrestlemania ...|    1321228800|
|B00002STAU| [1, 1]|    4.0|I had WWF Wrestle...|08 10, 2008| ANRNG7OAARR70|D. Hensley "Horro...|wrestling game wi...|    1218326400|
|B00002STAU| [0, 0]|    4.0|I have to admit I...|07 24, 2009|A

# Data Processing

In [4]:
# remove all unnecessary columns and only keep predictor feature and label
drop_list = ['asin', 'helpful', 'reviewTime', 'reviewerID', 'reviewerName', 'summary', 'unixReviewTime']
data = data.select([column for column in data.columns if column not in drop_list])

# format text column
data.createOrReplaceTempView("data")
data=spark.sql("SELECT overall, LOWER(reviewText) AS reviewText FROM data")
data.show(5)
data.printSchema()

+-------+--------------------+
|overall|          reviewText|
+-------+--------------------+
|    5.0|this is a old cla...|
|    4.0|this game is more...|
|    5.0|if you love wwf n...|
|    4.0|i had wwf wrestle...|
|    4.0|i have to admit i...|
+-------+--------------------+
only showing top 5 rows

root
 |-- overall: double (nullable = true)
 |-- reviewText: string (nullable = true)



In [5]:
# inspect data
from pyspark.sql.functions import col
data.groupBy("overall") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

data.groupBy("reviewText") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+-------+------+
|overall| count|
+-------+------+
|    5.0|594971|
|    4.0|210922|
|    1.0|112789|
|    3.0| 97153|
|    2.0| 59477|
+-------+------+

+------------+-----+
|  reviewText|count|
+------------+-----+
|            |  246|
|  great game|  100|
|        good|   88|
|       great|   85|
|   good game|   66|
|   excellent|   50|
|     love it|   44|
|     awesome|   36|
| works great|   36|
|   very good|   34|
| great game!|   29|
|   excelente|   28|
|         fun|   28|
|        nice|   27|
|          ok|   25|
|     perfect|   25|
|awesome game|   22|
|    excelent|   21|
|    fun game|   20|
| great game.|   17|
+------------+-----+
only showing top 20 rows



In [6]:
import nltk
nltk.download('stopwords')
nltk.download('words')
nltk.download('punkt')
import nltk
from nltk.corpus import stopwords
stopwords_lst=stopwords.words('english')+['1','2','3','4','5','6','7','8','9','0',]

[nltk_data] Downloading package stopwords to /home/hadoop/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package words to /home/hadoop/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package punkt to /home/hadoop/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [7]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression
# regular expression tokenizer
regexTokenizer = RegexTokenizer(inputCol="reviewText", outputCol="words", pattern="\\W")
# stop words
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(stopwords_lst)
# bag of words count
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=1)

In [8]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
label_stringIdx = StringIndexer(inputCol = "overall", outputCol = "label")
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])
# Fit the pipeline to training documents.
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)
dataset.show(5)

+-------+--------------------+--------------------+--------------------+--------------------+-----+
|overall|          reviewText|               words|            filtered|            features|label|
+-------+--------------------+--------------------+--------------------+--------------------+-----+
|    5.0|this is a old cla...|[this, is, a, old...|[old, classic, wr...|(10000,[0,41,51,3...|  0.0|
|    4.0|this game is more...|[this, game, is, ...|[game, one, one, ...|(10000,[0,2,5,7,5...|  1.0|
|    5.0|if you love wwf n...|[if, you, love, w...|[love, wwf, calle...|(10000,[0,6,20,23...|  0.0|
|    4.0|i had wwf wrestle...|[i, had, wwf, wre...|[wwf, wrestlemani...|(10000,[0,1,2,3,4...|  1.0|
|    4.0|i have to admit i...|[i, have, to, adm...|[admit, started, ...|(10000,[0,1,3,4,7...|  1.0|
+-------+--------------------+--------------------+--------------------+--------------------+-----+
only showing top 5 rows



label to stringIdx: overall->label: 1->3, 2->4, 3->2, 4->1, 5->0

In [9]:
# set seed for reproducibility
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 752381
Test Dataset Count: 322931


# NaiveBayes

In [10]:
# model trarining
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(smoothing=1)
model = nb.fit(trainingData)

# run model on test data
predictions = model.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("reviewText","overall","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+-------+------------------------------+-----+----------+
|                    reviewText|overall|                   probability|label|prediction|
+------------------------------+-------+------------------------------+-----+----------+
|i originally wrote a review...|    5.0|[1.0,1.0880086880649125E-16...|  0.0|       0.0|
|cuando uno termina este jue...|    5.0|[1.0,1.0437689243605692E-16...|  0.0|       0.0|
|i recently bought this psp ...|    5.0|[1.0,9.254118904297825E-17,...|  0.0|       0.0|
|este es un juego obligatori...|    5.0|[1.0,8.404487436486103E-17,...|  0.0|       0.0|
|the playstation vita tends ...|    4.0|[1.0,7.939853558907832E-17,...|  1.0|       0.0|
|this video game system live...|    5.0|[1.0,7.544758352543615E-17,...|  0.0|       0.0|
|since nintendo orignally ca...|    5.0|[1.0,6.850183256187071E-17,...|  0.0|       0.0|
|compre mi xbox 360 con la f...|    4.0|[1.0,6.695904736345189E-17,...|  1.0|       0.0|
|(es) excelente produ

In [11]:
# evaluate
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.5868996977927786

# Random Forest

In [12]:
# model trarining
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(labelCol="label", \
                            featuresCol="features", \
                            numTrees = 100, \
                            maxDepth = 4, \
                            maxBins = 32)
rfModel = rf.fit(trainingData)

# run model on test data
predictions = rfModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("reviewText","overall","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+-------+------------------------------+-----+----------+
|                    reviewText|overall|                   probability|label|prediction|
+------------------------------+-------+------------------------------+-----+----------+
|love it! love it! love it! ...|    5.0|[0.5852702080661911,0.18720...|  0.0|       0.0|
|i think it's great but that...|    5.0|[0.5850881217327596,0.18728...|  0.0|       0.0|
|great game so far with amaz...|    5.0|[0.5850881217327596,0.18728...|  0.0|       0.0|
|absolutely. amazing. this g...|    5.0|[0.5850881217327596,0.18728...|  0.0|       0.0|
|so i waited 2 weeks to writ...|    5.0|[0.5850881217327596,0.18728...|  0.0|       0.0|
|great game its awesome its ...|    5.0|[0.5850881217327596,0.18728...|  0.0|       0.0|
|i loved this game when it w...|    5.0|[0.5850881217327596,0.18728...|  0.0|       0.0|
|these are amazing! comforta...|    5.0|[0.58436512613015,0.1871791...|  0.0|       0.0|
|this is an amazing m

In [13]:
# evaluate
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.3947844044365289

# Three Label Data

In [14]:
data.createOrReplaceTempView('data')

In [15]:
# overall=1 or 2, attitue = negative 
# overall=3, attitue = neural
# overall=4 or 5, attitue = positive
from pyspark.sql import functions as F
df=data.withColumn('attitude', F.when(F.col('overall')<3,'negative').otherwise(F.when( F.col('overall') == 3,'neutral').otherwise('positive')))

In [16]:
df.show(10)
df.groupBy("attitude") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

+-------+--------------------+--------+
|overall|          reviewText|attitude|
+-------+--------------------+--------+
|    5.0|this is a old cla...|positive|
|    4.0|this game is more...|positive|
|    5.0|if you love wwf n...|positive|
|    4.0|i had wwf wrestle...|positive|
|    4.0|i have to admit i...|positive|
|    5.0|this game was ama...|positive|
|    4.0|this right here i...|positive|
|    3.0|the rampage editi...| neutral|
|    2.0|remember the mome...|negative|
|    2.0|back in 1993 sega...|negative|
+-------+--------------------+--------+
only showing top 10 rows

+--------+------+
|attitude| count|
+--------+------+
|positive|805893|
|negative|172266|
| neutral| 97153|
+--------+------+



# Data Processing

In [17]:
label_stringIdx = StringIndexer(inputCol = "attitude", outputCol = "label")
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])
# Fit the pipeline to training documents.
pipelineFit = pipeline.fit(df)
dataset = pipelineFit.transform(df)
dataset.show(5)

+-------+--------------------+--------+--------------------+--------------------+--------------------+-----+
|overall|          reviewText|attitude|               words|            filtered|            features|label|
+-------+--------------------+--------+--------------------+--------------------+--------------------+-----+
|    5.0|this is a old cla...|positive|[this, is, a, old...|[old, classic, wr...|(10000,[0,41,51,3...|  0.0|
|    4.0|this game is more...|positive|[this, game, is, ...|[game, one, one, ...|(10000,[0,2,5,7,5...|  0.0|
|    5.0|if you love wwf n...|positive|[if, you, love, w...|[love, wwf, calle...|(10000,[0,6,20,23...|  0.0|
|    4.0|i had wwf wrestle...|positive|[i, had, wwf, wre...|[wwf, wrestlemani...|(10000,[0,1,2,3,4...|  0.0|
|    4.0|i have to admit i...|positive|[i, have, to, adm...|[admit, started, ...|(10000,[0,1,3,4,7...|  0.0|
+-------+--------------------+--------+--------------------+--------------------+--------------------+-----+
only showing top 5 

In [18]:
# set seed for reproducibility
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 752381
Test Dataset Count: 322931


# Naive Bayes

In [19]:
# model training
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(smoothing=1)
model = nb.fit(trainingData)

# run model on test data
predictions = model.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("reviewText","attitude","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+--------+------------------------------+-----+----------+
|                    reviewText|attitude|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|yes, i am writing this revi...|positive|[1.0,8.786606125898127E-17,...|  0.0|       0.0|
|what is the best selling an...|positive|[1.0,8.623355319727688E-17,...|  0.0|       0.0|
|i got my sony psp about two...|positive|[1.0,8.002400118234624E-17,...|  0.0|       0.0|
|i have bought the 3g/wifi v...|positive|[1.0,5.786357441689283E-17,...|  0.0|       0.0|
|i love this system, it has ...|positive|[1.0,4.5055585060084E-17,1....|  0.0|       0.0|
|--update 12/15---ok, so aft...|positive|[1.0,3.527338995139392E-17,...|  0.0|       0.0|
|many months ago, my friend ...|positive|[1.0,3.4427716290248495E-17...|  0.0|       0.0|
|i have been playing video g...|positive|[1.0,2.32021759551896E-17,7...|  0.0|       0.0|
|okay, i'm

In [20]:
# evaluate
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.7884085277045114

# Random Forest

In [21]:
# model training
from pyspark.ml.classification import RandomForestClassifier
rf = RandomForestClassifier(labelCol="label", \
                            featuresCol="features", \
                            numTrees = 100, \
                            maxDepth = 4, \
                            maxBins = 32)
rfModel = rf.fit(trainingData)

# run model on test data
predictions = rfModel.transform(testData)
predictions.filter(predictions['prediction'] == 0) \
    .select("reviewText","attitude","probability","label","prediction") \
    .orderBy("probability", ascending=False) \
    .show(n = 10, truncate = 30)

+------------------------------+--------+------------------------------+-----+----------+
|                    reviewText|attitude|                   probability|label|prediction|
+------------------------------+--------+------------------------------+-----+----------+
|the 3ds has an amazing libr...|positive|[0.7720868011644232,0.14251...|  0.0|       0.0|
|i just recieved this contro...|positive|[0.7720486391536012,0.14249...|  0.0|       0.0|
|i'm ready for microsoft to ...|positive|[0.7709514824506684,0.14396...|  0.0|       0.0|
|great game. if you love fig...|positive|[0.7705717607176354,0.14415...|  0.0|       0.0|
|an amazing game i have been...|positive|[0.7705717607176354,0.14415...|  0.0|       0.0|
|bright and fun. my boyfrien...|positive|[0.7705717607176354,0.14415...|  0.0|       0.0|
|easy to configure.  fits pr...|positive|[0.7705228418742072,0.14442...|  0.0|       0.0|
|daughter loves it. her favo...|positive|[0.7705014140896247,0.14425...|  0.0|       0.0|
|this was 

In [22]:
# evaluate
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.6432648903486833