In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("spark-nlp") \
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.4.5") \
    .getOrCreate()

In [2]:
spark

In [3]:
sc = spark.sparkContext 
sc

In [6]:
data = spark.read.json("s3://qianyielva/video")

In [7]:
data = data.drop("helpful").drop("reviewerID").drop("unixReviewTime").drop("reviewTime").drop("reviewerName")

In [8]:
from pyspark.sql.types import IntegerType
data = data.withColumn("overall", data["overall"].cast(IntegerType()))

In [9]:
data.show(10)

+----------+-------+--------------------+--------------------+
|      asin|overall|          reviewText|             summary|
+----------+-------+--------------------+--------------------+
|0700099867|      1|Installing the ga...|Pay to unlock con...|
|0700099867|      4|If you like rally...|     Good rally game|
|0700099867|      1|1st shipment rece...|           Wrong key|
|0700099867|      3|I got this versio...|awesome game, if ...|
|0700099867|      4|I had Dirt 2 on X...|              DIRT 3|
|0700099867|      4|Overall this is a...|Good racing game,...|
|0700099867|      5|Loved playing Dir...|A step up from Di...|
|0700099867|      1|I can't tell you ...|Crash 3 is correc...|
|0700099867|      4|I initially gave ...|A great game ruin...|
|0700099867|      2|I still haven't f...|Couldn't get this...|
+----------+-------+--------------------+--------------------+
only showing top 10 rows



In [10]:
data.cache()

DataFrame[asin: string, overall: int, reviewText: string, summary: string]

In [11]:
import nltk
nltk.download('stopwords')
nltk.download('words')

[nltk_data] Downloading package stopwords to /home/hadoop/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package words to /home/hadoop/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [12]:
# get the list of stopwords from nltk
from nltk.corpus import stopwords

eng_stopwords = stopwords.words('english')
eng_stopwords.append('xxxx')

In [13]:
from sparknlp.base import Finisher, DocumentAssembler
from sparknlp.annotator import (Tokenizer, Normalizer, 
                                LemmatizerModel, StopWordsCleaner)
from pyspark.ml import Pipeline

In [14]:
documentAssembler = DocumentAssembler() \
    .setInputCol('reviewText') \
    .setOutputCol('document')

tokenizer = Tokenizer() \
    .setInputCols(['document']) \
    .setOutputCol('token')

# note normalizer defaults to changing all words to lowercase.
# Use .setLowercase(False) to maintain input case.
normalizer = Normalizer() \
    .setInputCols(['token']) \
    .setOutputCol('normalized') \
    .setLowercase(True)

# note that lemmatizer needs a dictionary. So I used the pre-trained
# model (note that it defaults to english)
lemmatizer = LemmatizerModel.pretrained() \
    .setInputCols(['normalized']) \
    .setOutputCol('lemma') \

stopwords_cleaner = StopWordsCleaner() \
    .setInputCols(['lemma']) \
    .setOutputCol('clean_lemma') \
    .setCaseSensitive(False) \
    .setStopWords(eng_stopwords)

# finisher converts tokens to human-readable output
finisher = Finisher() \
    .setInputCols(['clean_lemma']) \
    .setCleanAnnotations(False)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


In [15]:
pipeline = Pipeline() \
    .setStages([
        documentAssembler,
        tokenizer,
        normalizer,
        lemmatizer,
        stopwords_cleaner,
        finisher
    ])

In [16]:
data.createOrReplaceTempView("data")
df = spark.sql("SELECT reviewText FROM data")
df.show(10)

+--------------------+
|          reviewText|
+--------------------+
|Installing the ga...|
|If you like rally...|
|1st shipment rece...|
|I got this versio...|
|I had Dirt 2 on X...|
|Overall this is a...|
|Loved playing Dir...|
|I can't tell you ...|
|I initially gave ...|
|I still haven't f...|
+--------------------+
only showing top 10 rows



In [17]:
equifax = pipeline.fit(df).transform(df)
temp = equifax.select('finished_clean_lemma')
temp.show(10)

+--------------------+
|finished_clean_lemma|
+--------------------+
|[install, game, s...|
|[like, rally, car...|
|[st, shipment, re...|
|[get, version, in...|
|[dirt, xbox, okay...|
|[overall, well, r...|
|[love, play, dirt...|
|[cant, tell, piec...|
|[initially, give,...|
|[still, havent, f...|
+--------------------+
only showing top 10 rows



In [18]:
from pyspark.sql.functions import monotonically_increasing_id, row_number
from pyspark.sql.window import Window
data=data.withColumn('row_index', row_number().over(Window.orderBy(monotonically_increasing_id())))
temp=temp.withColumn('row_index', row_number().over(Window.orderBy(monotonically_increasing_id())))
data = data.join(temp, on=["row_index"]).drop("row_index")
data.show(10)

+----------+-------+--------------------+--------------------+--------------------+
|      asin|overall|          reviewText|             summary|finished_clean_lemma|
+----------+-------+--------------------+--------------------+--------------------+
|0700099867|      1|Installing the ga...|Pay to unlock con...|[install, game, s...|
|0700099867|      4|If you like rally...|     Good rally game|[like, rally, car...|
|0700099867|      1|1st shipment rece...|           Wrong key|[st, shipment, re...|
|0700099867|      3|I got this versio...|awesome game, if ...|[get, version, in...|
|0700099867|      4|I had Dirt 2 on X...|              DIRT 3|[dirt, xbox, okay...|
|0700099867|      4|Overall this is a...|Good racing game,...|[overall, well, r...|
|0700099867|      5|Loved playing Dir...|A step up from Di...|[love, play, dirt...|
|0700099867|      1|I can't tell you ...|Crash 3 is correc...|[cant, tell, piec...|
|0700099867|      4|I initially gave ...|A great game ruin...|[initially, gi

In [19]:
data.cache()

DataFrame[asin: string, overall: int, reviewText: string, summary: string, finished_clean_lemma: array<string>]

In [20]:
splitted_data = data.randomSplit([0.8, 0.2])
train_data = splitted_data[0]
test_data = splitted_data[1]

print("Number of training records: " + str(train_data.count()))
print("Number of testing records : " + str(test_data.count()))

Number of training records: 185748
Number of testing records : 46032


# TF-IDF modeling 

In [21]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline



In [22]:
tokenizer = Tokenizer(inputCol="reviewText", outputCol="words")
hashtf = HashingTF(numFeatures=2**16, inputCol="words", outputCol='tf')
idf = IDF(inputCol='tf', outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
label_stringIdx = StringIndexer(inputCol = "overall", outputCol = "label")
pipeline = Pipeline(stages=[tokenizer, hashtf, idf, label_stringIdx])

pipelineFit = pipeline.fit(train_data)
train_df = pipelineFit.transform(train_data)
test_df = pipelineFit.transform(test_data)
train_df.show(5)

+----------+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|      asin|overall|          reviewText|             summary|finished_clean_lemma|               words|                  tf|            features|label|
+----------+-------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|0700099867|      1|1st shipment rece...|           Wrong key|[st, shipment, re...|[1st, shipment, r...|(65536,[568,6534,...|(65536,[568,6534,...|  3.0|
|0700099867|      1|Crashed in Vista....|Don't waste your ...|[crash, vista, co...|[crashed, in, vis...|(65536,[4775,8315...|(65536,[4775,8315...|  3.0|
|0700099867|      1|DiRT 2 was like t...|The first one was...|[dirt, like, im, ...|[dirt, 2, was, li...|(65536,[1672,1706...|(65536,[1672,1706...|  3.0|
|0700099867|      1|I bought this and...|It might have bee...|[buy, key, didnt,...

In [23]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(maxIter=100)
lrModel = lr.fit(train_df)
predictions = lrModel.transform(test_df)

from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)

0.4477438794689748