In [0]:
from pyspark.sql.functions import lit

# Read the training table, keep a seeded sample drawn without replacement
# (fraction 0.15, handy while developing), and mark the result as cached.
df = (
    spark.sql("select * from default.reviews_train")
    .sample(withReplacement=False, fraction=0.15, seed=0)
    .cache()
)

# Report the sample's dimensions as a (row count, column count) tuple
n_rows = df.count()
n_cols = len(df.columns)
print((n_rows, n_cols))

(470645, 11)


In [0]:
# Print the column names, data types and nullability of the sampled DataFrame
df.printSchema()

root
 |-- reviewID: integer (nullable = true)
 |-- overall: double (nullable = true)
 |-- verified: boolean (nullable = true)
 |-- reviewTime: string (nullable = true)
 |-- reviewerID: string (nullable = true)
 |-- asin: string (nullable = true)
 |-- reviewerName: string (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- unixReviewTime: integer (nullable = true)
 |-- label: integer (nullable = true)



In [0]:
# Compute quick summary statistics for the sample, then print them
# as a plain-text table
summary_stats = df.describe()
summary_stats.show()

+-------+------------------+------------------+----------+--------------------+--------------------+--------------+--------------------+--------------------+--------------------+-------------------+
|summary|          reviewID|           overall|reviewTime|          reviewerID|                asin|  reviewerName|          reviewText|             summary|      unixReviewTime|              label|
+-------+------------------+------------------+----------+--------------------+--------------------+--------------+--------------------+--------------------+--------------------+-------------------+
|  count|            470645|            470645|    470645|              470645|              470645|        470612|              470645|              470585|              470645|             470645|
|   mean|1569466.7378533715| 4.316557065303997|      null|                null| 4.295464796096894E7|           NaN|  179.13333333333333|           50819.075|1.3922720677266304E9|0.17867819694249382|
| std

In [0]:
from pyspark.sql.functions import col
# Number of rows for each `overall` value, sorted by that value (ascending)
overall_counts = df.groupBy("overall").count()
display(overall_counts.orderBy("overall"))

overall,count
1.0,24375
2.0,20812
3.0,39424
4.0,82875
5.0,303159


In [0]:
# The 50 most common product IDs (`asin`), ranked by row count in the sample.
# limit() keeps the result a Spark DataFrame, so display() receives a
# DataFrame rather than a Python list of Row objects already collected on
# the driver (which is what head(50) returns).
display(
    df.groupBy("asin")
    .count()
    .orderBy(col("count").desc())
    .limit(50)
)

asin,count
0007420412,2681
000711835X,2468
0007548672,2171
0007350899,1101
0007444117,1076
B000YGEVMI,958
0007378033,855
B0015TMHSI,782
0007384289,682
006017322X,677


In [0]:
# Number of rows for each `label` value, sorted by label (ascending)
label_counts = df.groupBy("label").count()
display(label_counts.orderBy("label"))

label,count
0,386551
1,84094


In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover
from pyspark.sql import functions as f

# Tokenize the review text with a simple RegexTokenizer (pattern "\\W"),
# writing the tokens to a new "words" column
tokenizer = RegexTokenizer(inputCol="reviewText", outputCol="words", pattern="\\W")

# Remove standard stopwords from "words", writing the result to "filtered"
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered")

pipeline = Pipeline(stages=[tokenizer, stopwordsRemover])

# This cell rebinds `df` to the transformed frame, so on a re-run `df`
# already carries "words"/"filtered" and the transform would fail because
# its output columns exist. Dropping them first keeps the cell re-runnable;
# drop() ignores names that are absent, so the first run is unchanged.
df_pipeline_input = df.drop("words", "filtered")

pipelineFit = pipeline.fit(df_pipeline_input)
df = pipelineFit.transform(df_pipeline_input)

In [0]:
# Turn each row's "filtered" token list into one row per token, then count
# how often each token occurs, most frequent first.
token_rows = df.select(f.explode('filtered').alias('col'))
token_counts = token_rows.groupBy('col').count().sort(f.desc('count'))

# NOTE(review): collect() brings every distinct token and its count back to
# the driver as a Python list of Rows — confirm the vocabulary is small
# enough, or limit the result, before running on the full table.
counts = token_counts.collect()
display(counts)

col,count
one,167556
book,165187
game,141451
great,131777
like,129426
good,106801
well,94366
read,89533
time,83685
get,80686
