In [1]:
# %load "../../utils/environment.py"
from pyspark.ml.classification import LogisticRegression, NaiveBayes, DecisionTreeClassifier, GBTClassifier, \
    RandomForestClassifier
import pyspark as ps
from pyspark.sql.types import StructField, StructType, StringType, IntegerType

# Input locations: the raw gzip-compressed TSV dump and its Parquet copy.
DATA_FILE = '../../data/amazon_reviews_us_Camera_v1_00.tsv.gz'
DATA_FILE2 = "../../data/parquet/my-parquet-file.parquet"

APP_NAME = 'Benchmarks'

# Explicit schema for the review data. Every column is nullable; only the
# rating and the two vote counters are integers, everything else is kept
# as a string. Column order matters and is preserved from the table below.
review_schema = StructType([
    StructField(column_name, column_type(), True)
    for column_name, column_type in (
        ('marketplace', StringType),
        ('customer_id', StringType),
        ('review_id', StringType),
        ('product_id', StringType),
        ('product_parent', StringType),
        ('product_title', StringType),
        ('product_category', StringType),
        ('star_rating', IntegerType),
        ('helpful_votes', IntegerType),
        ('total_votes', IntegerType),
        ('vine', StringType),
        ('verified_purchase', StringType),
        ('review_headline', StringType),
        ('review_body', StringType),
        ('review_date', StringType),
    )
])

# Reuse an existing session if one is already running, otherwise start one
# with the "local[1]" master.
spark = ps.sql.SparkSession.builder.master("local[1]").appName(APP_NAME).getOrCreate()
sc = spark.sparkContext


In [11]:
%%time
# Benchmark input #1: the tab-separated file with a header row, read with
# the explicit review_schema.
benchmark1 = (
    spark.read
    .format("csv")
    .options(header="true", sep="\t")
    .schema(review_schema)
    .load(DATA_FILE)
)
# Expose the DataFrame to spark.sql() under the name "benchmark1".
benchmark1.createOrReplaceTempView("benchmark1")

CPU times: user 147 µs, sys: 4.24 ms, total: 4.38 ms
Wall time: 95.7 ms


In [3]:
# One-time conversion of the CSV-backed DataFrame into the Parquet dataset
# that the benchmark2 cell loads. The write is skipped when the dataset is
# already on disk, so a Restart & Run All works on a fresh checkout without
# paying for the export on every run.
import os

if not os.path.exists(DATA_FILE2):
    benchmark1.repartition(10).write.format("parquet").mode("overwrite") \
        .save(DATA_FILE2)

In [12]:
%%time
# Benchmark input #2: the Parquet copy of the same data, read with the same
# explicit schema as the CSV variant.
benchmark2 = spark.read.load(DATA_FILE2, format="parquet", schema=review_schema)
# Expose the DataFrame to spark.sql() under the name "benchmark2".
benchmark2.createOrReplaceTempView("benchmark2")

CPU times: user 4.23 ms, sys: 887 µs, total: 5.12 ms
Wall time: 60.2 ms


In [9]:
%%timeit
# DataFrame-API variant on the CSV-backed data: reviews per star rating.
# show() prints the table and returns None, so the result is not bound.
(
    benchmark1
    .select("star_rating")
    .groupBy("star_rating")
    .count()
    .show()
)

+-----------+-------+
|star_rating|  count|
+-----------+-------+
|       null|      2|
|          1| 170157|
|          3| 141460|
|          5|1062706|
|          4| 336700|
|          2|  90949|
+-----------+-------+

+-----------+-------+
|star_rating|  count|
+-----------+-------+
|       null|      2|
|          1| 170157|
|          3| 141460|
|          5|1062706|
|          4| 336700|
|          2|  90949|
+-----------+-------+

+-----------+-------+
|star_rating|  count|
+-----------+-------+
|       null|      2|
|          1| 170157|
|          3| 141460|
|          5|1062706|
|          4| 336700|
|          2|  90949|
+-----------+-------+

+-----------+-------+
|star_rating|  count|
+-----------+-------+
|       null|      2|
|          1| 170157|
|          3| 141460|
|          5|1062706|
|          4| 336700|
|          2|  90949|
+-----------+-------+

+-----------+-------+
|star_rating|  count|
+-----------+-------+
|       null|      2|
|          1| 170157|
|     

In [None]:
%%timeit
# SQL variant of the per-rating count, against the "benchmark1" temp view.
# show() prints the table and returns None, so the result is not bound.
spark.sql(
    "select star_rating, count(*) as counts from benchmark1 group by star_rating"
).show()

In [None]:
%%timeit
# SQL variant of the per-rating count, against the "benchmark2" temp view.
# show() prints the table and returns None, so the result is not bound.
spark.sql(
    "select star_rating, count(*) as counts from benchmark2 group by star_rating"
).show()

In [10]:
%%timeit
# DataFrame-API variant on the Parquet-backed data: reviews per star rating.
# show() prints the table and returns None, so the result is not bound.
(
    benchmark2
    .select("star_rating")
    .groupBy("star_rating")
    .count()
    .show()
)

+-----------+-------+
|star_rating|  count|
+-----------+-------+
|       null|      2|
|          1| 170157|
|          3| 141460|
|          5|1062706|
|          4| 336700|
|          2|  90949|
+-----------+-------+

+-----------+-------+
|star_rating|  count|
+-----------+-------+
|       null|      2|
|          1| 170157|
|          3| 141460|
|          5|1062706|
|          4| 336700|
|          2|  90949|
+-----------+-------+

+-----------+-------+
|star_rating|  count|
+-----------+-------+
|       null|      2|
|          1| 170157|
|          3| 141460|
|          5|1062706|
|          4| 336700|
|          2|  90949|
+-----------+-------+

+-----------+-------+
|star_rating|  count|
+-----------+-------+
|       null|      2|
|          1| 170157|
|          3| 141460|
|          5|1062706|
|          4| 336700|
|          2|  90949|
+-----------+-------+

+-----------+-------+
|star_rating|  count|
+-----------+-------+
|       null|      2|
|          1| 170157|
|     