In [1]:
# findspark.init() is called before the first pyspark import in this cell.
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Create (or reuse) the notebook's Spark session, named "spark-nlp", with the
# Spark NLP 2.4.5 package (Scala 2.11 build) listed under spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("spark-nlp")
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.11:2.4.5")
    .getOrCreate()
)

In [2]:
# Display the session object created in the previous cell.
spark

In [3]:
# Keep a handle on the session's SparkContext and display it.
sc = spark.sparkContext
sc

In [4]:
# Read the JSON review records from the S3 location into a DataFrame.
data = spark.read.format("json").load("s3://qianyielva/video")

In [5]:
# Remove the reviewer / timestamp / helpfulness columns in a single drop call.
unused_columns = ["helpful", "reviewerID", "unixReviewTime", "reviewTime", "reviewerName"]
data = data.drop(*unused_columns)

In [6]:
from pyspark.sql.types import IntegerType

# Replace "overall" with an integer-typed copy of itself.
overall_as_int = data["overall"].astype(IntegerType())
data = data.withColumn("overall", overall_as_int)

In [7]:
# Preview the first 10 rows after the column drops and the integer cast.
data.show(10)

+----------+-------+--------------------+--------------------+
|      asin|overall|          reviewText|             summary|
+----------+-------+--------------------+--------------------+
|0700099867|      1|Installing the ga...|Pay to unlock con...|
|0700099867|      4|If you like rally...|     Good rally game|
|0700099867|      1|1st shipment rece...|           Wrong key|
|0700099867|      3|I got this versio...|awesome game, if ...|
|0700099867|      4|I had Dirt 2 on X...|              DIRT 3|
|0700099867|      4|Overall this is a...|Good racing game,...|
|0700099867|      5|Loved playing Dir...|A step up from Di...|
|0700099867|      1|I can't tell you ...|Crash 3 is correc...|
|0700099867|      4|I initially gave ...|A great game ruin...|
|0700099867|      2|I still haven't f...|Couldn't get this...|
+----------+-------+--------------------+--------------------+
only showing top 10 rows



In [9]:
from pyspark.sql import functions as F

In [13]:
# Map the star rating to a three-level attitude label:
#   overall = 1 or 2  -> attitude = 1 (negative)
#   overall = 3       -> attitude = 2 (neutral)
#   overall = 4 or 5  -> attitude = 3 (positive)
# Every level has its own explicit condition and there is no catch-all
# branch, so a row whose rating is null gets a null attitude instead of
# silently falling through to "positive".
data = data.withColumn(
    'attitude',
    F.when(F.col('overall') < 3, 1)
     .when(F.col('overall') == 3, 2)
     .when(F.col('overall') > 3, 3)
)

In [14]:
# Preview the first 5 rows to check the new "attitude" column against "overall".
data.show(5)

+----------+-------+--------------------+--------------------+--------+
|      asin|overall|          reviewText|             summary|attitude|
+----------+-------+--------------------+--------------------+--------+
|0700099867|      1|Installing the ga...|Pay to unlock con...|       1|
|0700099867|      4|If you like rally...|     Good rally game|       3|
|0700099867|      1|1st shipment rece...|           Wrong key|       1|
|0700099867|      3|I got this versio...|awesome game, if ...|       2|
|0700099867|      4|I had Dirt 2 on X...|              DIRT 3|       3|
+----------+-------+--------------------+--------------------+--------+
only showing top 5 rows



In [16]:
from pyspark.sql.types import DoubleType

# Re-type the attitude label as a double.
data = data.withColumn("attitude", F.col("attitude").astype(DoubleType()))

In [17]:
# Remove the original star-rating column; the next cell re-creates "overall"
# from the attitude label.
data=data.drop("overall")

In [18]:
# Add a new "overall" column that is a copy of the "attitude" label.
data=data.withColumn("overall", data["attitude"])

In [20]:
# Remove "attitude" now that its values live in "overall".
data=data.drop("attitude")

In [21]:
# Preview the first 5 rows; "overall" now holds the 1.0 / 2.0 / 3.0 label.
data.show(5)

+----------+--------------------+--------------------+-------+
|      asin|          reviewText|             summary|overall|
+----------+--------------------+--------------------+-------+
|0700099867|Installing the ga...|Pay to unlock con...|    1.0|
|0700099867|If you like rally...|     Good rally game|    3.0|
|0700099867|1st shipment rece...|           Wrong key|    1.0|
|0700099867|I got this versio...|awesome game, if ...|    2.0|
|0700099867|I had Dirt 2 on X...|              DIRT 3|    3.0|
+----------+--------------------+--------------------+-------+
only showing top 5 rows



# Data cleaning: NLP pipeline

In [22]:
import nltk
# Download the NLTK 'stopwords' and 'words' packages. The stopwords corpus is
# read in the next cell.
# NOTE(review): 'words' is not referenced in the cells visible here — confirm
# whether it is still needed.
nltk.download('stopwords')
nltk.download('words')

[nltk_data] Downloading package stopwords to /home/hadoop/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /home/hadoop/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [23]:
# Stop-word list for the cleaner: NLTK's English stop words plus the extra
# token 'xxxx'.
from nltk.corpus import stopwords

eng_stopwords = stopwords.words('english') + ['xxxx']

In [24]:
from sparknlp.base import Finisher, DocumentAssembler
from sparknlp.annotator import (Tokenizer, Normalizer, 
                                LemmatizerModel, StopWordsCleaner)
from pyspark.ml import Pipeline

In [25]:
# Spark NLP stages. They are chained through their column names:
# reviewText -> document -> token -> normalized -> lemma -> clean_lemma.
# Each chain is wrapped in parentheses rather than continued with backslashes,
# which removes the dangling trailing backslash the lemmatizer chain used to
# end with (it only parsed because a blank line followed it).

# Entry point: reads the 'reviewText' column and writes 'document'.
documentAssembler = (
    DocumentAssembler()
    .setInputCol('reviewText')
    .setOutputCol('document')
)

tokenizer = (
    Tokenizer()
    .setInputCols(['document'])
    .setOutputCol('token')
)

# Lowercasing is switched on explicitly here; pass .setLowercase(False)
# instead to keep the input case.
normalizer = (
    Normalizer()
    .setInputCols(['token'])
    .setOutputCol('normalized')
    .setLowercase(True)
)

# The lemmatizer needs a dictionary, so the pre-trained model is used.
# pretrained() is called with no arguments; the recorded cell output shows it
# downloading 'lemma_antbnc'.
lemmatizer = (
    LemmatizerModel.pretrained()
    .setInputCols(['normalized'])
    .setOutputCol('lemma')
)

# Case-insensitive removal of the words in eng_stopwords.
stopwords_cleaner = (
    StopWordsCleaner()
    .setInputCols(['lemma'])
    .setOutputCol('clean_lemma')
    .setCaseSensitive(False)
    .setStopWords(eng_stopwords)
)

# The finisher converts the clean_lemma tokens to human-readable output.
# NOTE(review): setCleanAnnotations(False) presumably keeps the intermediate
# annotation columns in the result — confirm against the Spark NLP docs.
finisher = (
    Finisher()
    .setInputCols(['clean_lemma'])
    .setCleanAnnotations(False)
)

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


In [26]:
# Assemble the NLP stages in the order their input/output columns chain.
nlp_stages = [
    documentAssembler,
    tokenizer,
    normalizer,
    lemmatizer,
    stopwords_cleaner,
    finisher,
]
pipeline = Pipeline(stages=nlp_stages)

In [27]:
# Register the frame as the temp view "data", then read just the review text
# back out of that view.
data.createOrReplaceTempView("data")
df = spark.table("data").select("reviewText")
df.show(10)

+--------------------+
|          reviewText|
+--------------------+
|Installing the ga...|
|If you like rally...|
|1st shipment rece...|
|I got this versio...|
|I had Dirt 2 on X...|
|Overall this is a...|
|Loved playing Dir...|
|I can't tell you ...|
|I initially gave ...|
|I still haven't f...|
+--------------------+
only showing top 10 rows



In [28]:
# Fit the NLP pipeline on the review text, then apply the fitted model to the
# same frame.
nlp_model = pipeline.fit(df)
equifax = nlp_model.transform(df)

# Keep only the finished (plain string array) lemma column and preview it.
temp = equifax.select(equifax['finished_clean_lemma'])
temp.show(10)

+--------------------+
|finished_clean_lemma|
+--------------------+
|[install, game, s...|
|[like, rally, car...|
|[st, shipment, re...|
|[get, version, in...|
|[dirt, xbox, okay...|
|[overall, well, r...|
|[love, play, dirt...|
|[cant, tell, piec...|
|[initially, give,...|
|[still, havent, f...|
+--------------------+
only showing top 10 rows



In [29]:
# These two imports are no longer used in this cell; they are kept so that any
# other cell relying on the names still runs.
from pyspark.sql.functions import monotonically_increasing_id, row_number
from pyspark.sql.window import Window

# Attach the cleaned lemmas to the full frame by running the NLP pipeline on
# `data` itself, so every row keeps its own asin / summary / overall values.
# This replaces a join on row_number() over monotonically_increasing_id(),
# which (a) only lines the two frames up if both are partitioned and evaluated
# identically, and (b) uses a window with no partitioning, which pulls every
# row into a single partition.
source_columns = data.columns
data = (
    pipeline.fit(data)
    .transform(data)
    # Keep the original columns plus the finished lemma array; the
    # intermediate annotation columns are not carried forward.
    .select(source_columns + ['finished_clean_lemma'])
)
data.show(10)

+----------+--------------------+--------------------+-------+--------------------+
|      asin|          reviewText|             summary|overall|finished_clean_lemma|
+----------+--------------------+--------------------+-------+--------------------+
|0700099867|Installing the ga...|Pay to unlock con...|    1.0|[install, game, s...|
|0700099867|If you like rally...|     Good rally game|    3.0|[like, rally, car...|
|0700099867|1st shipment rece...|           Wrong key|    1.0|[st, shipment, re...|
|0700099867|I got this versio...|awesome game, if ...|    2.0|[get, version, in...|
|0700099867|I had Dirt 2 on X...|              DIRT 3|    3.0|[dirt, xbox, okay...|
|0700099867|Overall this is a...|Good racing game,...|    3.0|[overall, well, r...|
|0700099867|Loved playing Dir...|A step up from Di...|    3.0|[love, play, dirt...|
|0700099867|I can't tell you ...|Crash 3 is correc...|    1.0|[cant, tell, piec...|
|0700099867|I initially gave ...|A great game ruin...|    3.0|[initially, gi

In [30]:
# Mark the joined frame for caching; it is reused by the modelling cells below.
data.cache()

DataFrame[asin: string, reviewText: string, summary: string, overall: double, finished_clean_lemma: array<string>]

In [None]:
# 80/20 split of the cleaned frame. The seed is fixed (100, the same value the
# other randomSplit calls in this notebook use) so the split, and the counts
# printed below, are reproducible across runs.
splitted_data = data.randomSplit([0.8, 0.2], seed=100)
train_data, test_data = splitted_data

print("Number of training records: " + str(train_data.count()))
print("Number of testing records : " + str(test_data.count()))

# TF-IDF (logistic regression) with cross-validation

In [41]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator


In [35]:
# Feature stages for the TF-IDF model, chained by column name:
# reviewText -> words -> tf -> features, plus overall -> label.
tokenizer = Tokenizer(
    inputCol="reviewText",
    outputCol="words",
)
hashtf = HashingTF(
    inputCol="words",
    outputCol='tf',
    numFeatures=2**16,
)
# minDocFreq=5: remove sparse terms.
idf = IDF(
    inputCol='tf',
    outputCol="features",
    minDocFreq=5,
)
label_stringIdx = StringIndexer(
    inputCol="overall",
    outputCol="label",
)

tfidf_stages = [tokenizer, hashtf, idf, label_stringIdx]
pipeline1 = Pipeline(stages=tfidf_stages)

In [36]:
# Fit the TF-IDF + label-indexing pipeline on the full frame, then apply the
# fitted pipeline to that same frame.
# NOTE(review): the fit happens before the train/test split made further down,
# so the fitted stages see rows that later land in the test split — confirm
# this is intended.
pipelineFit1 = pipeline1.fit(data)
dataset1 = pipelineFit1.transform(data)

In [37]:
# Base estimator for the TF-IDF model. regParam and elasticNetParam are also
# the two parameters listed in the cross-validation grid below.
lr2 = LogisticRegression(
    maxIter=20,
    regParam=0.3,
    elasticNetParam=0,
)

In [38]:
# Seeded 80/20 split of the TF-IDF feature frame, followed by the size of
# each part.
tfidf_splits = dataset1.randomSplit([0.8, 0.2], seed=100)
trainingData1 = tfidf_splits[0]
testData1 = tfidf_splits[1]

print("Number of training records: {}".format(trainingData1.count()))
print("Number of testing records : {}".format(testData1.count()))

Number of training records: 185208
Number of testing records : 46572


In [39]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Cross-validation grid for lr2: three regularization strengths crossed with
# three elastic-net mixing values (0.0 = ridge).
grid_builder = ParamGridBuilder()
grid_builder.addGrid(lr2.regParam, [0.1, 0.3, 0.5])
grid_builder.addGrid(lr2.elasticNetParam, [0.0, 0.1, 0.2])
paramGrid = grid_builder.build()

In [42]:
# 5-fold cross-validation of lr2 over paramGrid. No metricName is given, so
# the evaluator's default metric is used. The seed is fixed (100, matching the
# seeded randomSplit calls in this notebook) so the fold assignment is
# reproducible between runs. The line-continuation backslashes were dropped:
# they are redundant inside the call's parentheses.
cv1 = CrossValidator(
    estimator=lr2,
    estimatorParamMaps=paramGrid,
    evaluator=MulticlassClassificationEvaluator(),
    numFolds=5,
    seed=100,
)

In [None]:
# Run the cross-validated grid search on the TF-IDF training split.
cvModel1 = cv1.fit(trainingData1)

In [None]:
# Score the held-out TF-IDF split with the cross-validated model.
predictions = cvModel1.transform(testData1)

# Evaluate the best model; no metricName is given, so the evaluator's default
# metric is reported.
evaluator = MulticlassClassificationEvaluator().setPredictionCol("prediction")
evaluator.evaluate(predictions)

# CountVectorizer approach (logistic regression) with cross-validation

In [None]:
# Base estimator for the count-vector model. regParam and elasticNetParam are
# also the two parameters listed in the cross-validation grid below.
lr1 = LogisticRegression(
    maxIter=20,
    regParam=0.3,
    elasticNetParam=0,
)

In [None]:
# Index the "overall" class column into the "label" column (same settings as
# the indexer defined in the TF-IDF section).
label_stringIdx = StringIndexer(inputCol = "overall", outputCol = "label")

In [None]:
# Bag-of-words features built from the cleaned lemma arrays, with
# vocabSize=10000 and minDF=5.
countVectors = CountVectorizer(
    inputCol="finished_clean_lemma",
    outputCol="features",
    vocabSize=10000,
    minDF=5,
)

In [None]:
# Preview the frame that feeds the count-vector pipeline.
data.show(5)

In [None]:
# Count-vector feature pipeline: vectorise the lemmas, then index the label.
# Note that this rebinds the name `pipeline`.
pipeline = Pipeline().setStages([countVectors, label_stringIdx])

# Fit on the full frame and apply the fitted pipeline to that same frame.
pipelineFit = pipeline.fit(data)
dataset = pipelineFit.transform(data)

In [None]:
# Seeded 80/20 split of the count-vector feature frame, followed by the size
# of each part.
count_splits = dataset.randomSplit([0.8, 0.2], seed=100)
trainingData = count_splits[0]
testData = count_splits[1]

print("Number of training records: {}".format(trainingData.count()))
print("Number of testing records : {}".format(testData.count()))

In [None]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Cross-validation grid for lr1: three regularization strengths crossed with
# three elastic-net mixing values (0.0 = ridge).
grid_builder = ParamGridBuilder()
grid_builder.addGrid(lr1.regParam, [0.1, 0.3, 0.5])
grid_builder.addGrid(lr1.elasticNetParam, [0.0, 0.1, 0.2])
paramGrid = grid_builder.build()

In [None]:
# 5-fold cross-validation of lr1 over paramGrid. No metricName is given, so
# the evaluator's default metric is used. The seed is fixed (100, matching the
# seeded randomSplit calls in this notebook) so the fold assignment is
# reproducible between runs. The line-continuation backslashes were dropped:
# they are redundant inside the call's parentheses.
cv = CrossValidator(
    estimator=lr1,
    estimatorParamMaps=paramGrid,
    evaluator=MulticlassClassificationEvaluator(),
    numFolds=5,
    seed=100,
)

In [None]:
# Run the cross-validated grid search on the count-vector training split.
cvModel = cv.fit(trainingData)

In [None]:
# Score the held-out count-vector split with the cross-validated model.
predictions = cvModel.transform(testData)

# Evaluate the best model; no metricName is given, so the evaluator's default
# metric is reported.
evaluator = MulticlassClassificationEvaluator().setPredictionCol("prediction")
evaluator.evaluate(predictions)