In [0]:
# Load a 10% sample of the training reviews table.
# The sample is seeded so that re-running the notebook draws the same rows;
# with an unseeded sample, the seeded train/test split further down would
# still operate on different data on every run.
df = spark.sql("select * from default.reviews_train").sample(fraction=0.1, seed=47)

# Cache the sample: it is counted here and reused by the cells that follow.
df = df.cache()

# (row count, column count) of the sampled frame
print((df.count(), len(df.columns)))

(313825, 11)


In [0]:
# Print the column names, types and nullability of the sampled DataFrame.
df.printSchema()

root
 |-- reviewID: integer (nullable = true)
 |-- overall: double (nullable = true)
 |-- verified: boolean (nullable = true)
 |-- reviewTime: string (nullable = true)
 |-- reviewerID: string (nullable = true)
 |-- asin: string (nullable = true)
 |-- reviewerName: string (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- unixReviewTime: integer (nullable = true)
 |-- label: integer (nullable = true)



In [0]:
# For our initial modeling efforts, we are not going to use the following features.
# (Each column is listed once; names that are absent from the table are harmless,
# because DataFrame.drop ignores column names it cannot find.)
drop_list = ['summary', 'asin', 'reviewID', 'reviewerID', 'unixReviewTime',
             'reviewTime', 'image', 'style', 'reviewerName']
df = df.drop(*drop_list)

# Discard rows that have no review text or no label.
df = df.na.drop(subset=["reviewText", "label"])

# Preview the remaining columns and report (row count, column count).
df.show(5)
print((df.count(), len(df.columns)))

+-------+--------+--------------------+-----+
|overall|verified|          reviewText|label|
+-------+--------+--------------------+-----+
|    5.0|    true|This game is a bi...|    0|
|    5.0|    true|These mugs do wha...|    0|
|    3.0|   false|41 years later: T...|    0|
|    4.0|    true|           Good game|    0|
|    5.0|    true|Dirt 3 on DVD i c...|    0|
+-------+--------+--------------------+-----+
only showing top 5 rows

(313825, 4)


In [0]:
# In Spark's MLlib, it's considered good practice to combine all the preprocessing steps into a pipeline.
# That way, you can run the same steps on the training data, the testing data, and any new data
# without copying and pasting any code.

# It is possible to run all of these steps one-by-one, outside of a Pipeline, if desired. But that's
# not how I am going to do it here.

from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, VectorAssembler, IDF
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, NaiveBayes
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# --- Text preprocessing ------------------------------------------------------
# Split the review text into tokens on non-word characters.
tokenizer = RegexTokenizer(pattern="\\W", inputCol="reviewText", outputCol="words")

# Strip the standard stop words from the token list.
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered")

# TODO: insert other cleaning steps here (and put them into the pipeline, of course!)
# E.g., n-grams? document length?

# --- Feature extraction ------------------------------------------------------
# Simple bag-of-words term counts. Other extractors are described at:
# https://spark.apache.org/docs/2.2.0/ml-features.html#feature-extractors
tf = CountVectorizer(
    inputCol="filtered",
    outputCol="rawFeatures",
    vocabSize=2000,
    minTF=1,
    maxDF=0.40,
)

# Re-weight the raw counts by inverse document frequency.
idf = IDF(inputCol="rawFeatures", outputCol="idfFeatures", minDocFreq=100)

# Concatenate the two scalar columns and the text vector into a single
# "features" column for the classifier.
assembler = VectorAssembler(
    inputCols=["verified", "overall", "idfFeatures"],
    outputCol="features",
)

# --- Model -------------------------------------------------------------------
ml_alg = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.0)

# Stages run in list order, so each stage's input column is produced by an
# earlier stage (or is already present in the input frame).
pipeline = Pipeline(stages=[tokenizer, stopwordsRemover, tf, idf, assembler, ml_alg])

# Hyper-parameter grid: every combination of the values listed below.
# NOTE(review): no cell visible here passes this grid to a CrossValidator, so it
# currently has no effect on the fitted model.
# NOTE(review): minTF is a per-document term threshold; 100 and 1000 look very
# high for short reviews - confirm that minDF was not intended.
paramGrid = (
    ParamGridBuilder()
    .addGrid(ml_alg.regParam, [0.3, 0.5, 0.7])
    .addGrid(ml_alg.elasticNetParam, [0.0])
    .addGrid(tf.minTF, [1, 100, 1000])
    .addGrid(tf.vocabSize, [500, 1000, 2500, 5000])
    .build()
)


In [0]:
# Hold out 10% of the rows for testing; the fixed seed is there for reproducibility.
trainingData, testData = df.randomSplit(weights=[0.9, 0.1], seed=47)

# Report the size of each split.
for _caption, _subset in (
    ("Training Dataset Count: ", trainingData),
    ("Test Dataset Count:     ", testData),
):
    print(_caption + str(_subset.count()))

Training Dataset Count: 282703
Test Dataset Count:     31122


In [0]:
# Fit all pipeline stages (preprocessing through the classifier) on the training split.
pipelineFit = pipeline.fit(trainingData)

In [0]:
# Apply the fitted pipeline to the held-out split.
predictions = pipelineFit.transform(testData)
# Show how many held-out rows were assigned to each predicted class.
predictions.groupBy("prediction").count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|       0.0|29386|
|       1.0| 1736|
+----------+-----+



In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

# Evaluators for the held-out predictions. The three multiclass evaluators all
# compare the "label" column against the "prediction" column; the two binary
# evaluators are left on their default columns.
acc_evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy", labelCol="label", predictionCol="prediction")
pre_evaluator = MulticlassClassificationEvaluator(
    metricName="weightedPrecision", labelCol="label", predictionCol="prediction")
rec_evaluator = MulticlassClassificationEvaluator(
    metricName="weightedRecall", labelCol="label", predictionCol="prediction")
pr_evaluator = BinaryClassificationEvaluator(metricName="areaUnderPR")
auc_evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC")

# Only the area under the ROC curve is reported; uncomment a line below to
# print one of the other metrics as well.
#print("Test Accuracy       = %g" % (acc_evaluator.evaluate(predictions)))
#print("Test Precision      = %g" % (pre_evaluator.evaluate(predictions)))
#print("Test Recall         = %g" % (rec_evaluator.evaluate(predictions)))
#print("Test areaUnderPR    = %g" % (pr_evaluator.evaluate(predictions)))
print("Test areaUnderROC   = %g" % auc_evaluator.evaluate(predictions))

Test areaUnderROC   = 0.836534


In [0]:
# Score the Kaggle test table with the fitted pipeline.
# Rows with a null reviewText were dropped from the training data, but the test
# table is scored as-is, so null review text is replaced with an empty string
# instead of being dropped (presumably every reviewID needs a prediction in the
# submission - confirm).
# NOTE(review): the tokenizer stage is expected to fail on null input; verify
# against the Spark version in use.
test_df = spark.sql("select * from default.reviews_test").na.fill("", subset=["reviewText"])
kaggle_pred = pipelineFit.transform(test_df)

# Preview the scored rows and the distribution of predicted classes.
kaggle_pred.show(5)
kaggle_pred.groupBy("prediction").count().show()

+--------+-------+--------+-----------+--------------+----------+------------+--------------------+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|reviewID|overall|verified| reviewTime|    reviewerID|      asin|reviewerName|          reviewText|             summary|unixReviewTime|               words|            filtered|         rawFeatures|         idfFeatures|            features|       rawPrediction|         probability|prediction|
+--------+-------+--------+-----------+--------------+----------+------------+--------------------+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|80000001|    4.0|   false|07 27, 2015|A1JGAP0185YJI6|0700026657|      travis|I played it a whi...|But in spite of t..

In [0]:
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType

# UDF that extracts element 1 of the "probability" vector as a float
# (presumably the predicted probability of label 1 - confirm).
probelement = udf(lambda v: float(v[1]), FloatType())

# Name the output column explicitly with alias(). Renaming Spark's
# auto-generated "<lambda>(probability)" column is fragile: withColumnRenamed
# is a silent no-op when that generated name differs, which would leave the
# submission without a "label" column.
submission_data = kaggle_pred.select('reviewID', probelement('probability').alias('label'))

In [0]:
# Render the two submission columns. Download this and submit to Kaggle!
display(submission_data.select("reviewID", "label"))

reviewID,label
80000001,0.14093284
80000002,0.20602989
80000003,0.10382544
80000004,0.44697323
80000005,0.35459656
80000006,0.23288304
80000007,0.14883634
80000008,0.7620642
80000009,0.11907108
80000010,0.6763279
