In [1]:
# %load "../../utils/environment.py"
from pyspark.ml.classification import LogisticRegression, NaiveBayes, DecisionTreeClassifier, GBTClassifier, \
    RandomForestClassifier
import pyspark as ps
from pyspark.sql.types import StructField, StructType, StringType, IntegerType

# Input locations: the raw gzipped TSV and the Parquet copy used for comparison.
DATA_FILE = '../../data/amazon_reviews_us_Camera_v1_00.tsv.gz'
DATA_FILE2 = "../../data/parquet/my-parquet-file.parquet"

APP_NAME = 'Benchmarks'

# Explicit schema for the reviews data set. Every column is nullable; the
# three counters are integers and everything else is kept as a string.
review_schema = StructType([
    StructField(column_name, column_type(), True)
    for column_name, column_type in (
        ('marketplace', StringType),
        ('customer_id', StringType),
        ('review_id', StringType),
        ('product_id', StringType),
        ('product_parent', StringType),
        ('product_title', StringType),
        ('product_category', StringType),
        ('star_rating', IntegerType),
        ('helpful_votes', IntegerType),
        ('total_votes', IntegerType),
        ('vine', StringType),
        ('verified_purchase', StringType),
        ('review_headline', StringType),
        ('review_body', StringType),
        ('review_date', StringType),
    )
])

# Reuse an existing session if one is running, otherwise start a local one
# with the "local[1]" master.
spark = ps.sql.SparkSession.builder.master("local[1]").appName(APP_NAME).getOrCreate()
sc = spark.sparkContext


In [2]:
%%time
benchmark1 = spark.read.format("csv") \
    .option("header", "true") \
    .option("sep", "\t") \
    .schema(review_schema) \
    .load(DATA_FILE)
benchmark1.createOrReplaceTempView("benchmark1")

CPU times: user 5.13 ms, sys: 0 ns, total: 5.13 ms
Wall time: 2.82 s


In [3]:
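# One-time setup: uncomment and run to (re)create the Parquet copy of
# benchmark1 at DATA_FILE2, which the Parquet-based cells read.
# benchmark1.repartition(10).write.format("parquet").mode("overwrite")\
#   .save(DATA_FILE2)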

In [4]:
%%time
benchmark2 = spark.read.format("parquet") \
    .schema(review_schema) \
    .load(DATA_FILE2)
benchmark2.createOrReplaceTempView("benchmark2")

CPU times: user 4.05 ms, sys: 0 ns, total: 4.05 ms
Wall time: 206 ms


In [5]:
%%timeit
_=benchmark1.select("star_rating").groupby("star_rating").count().collect()

15.8 s ± 257 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
%%timeit
_=spark.sql("select star_rating, count(*) as counts from benchmark1 group by star_rating").collect()

15.3 s ± 125 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
%%timeit
_=spark.sql("select star_rating, count(*) as counts from benchmark2 group by star_rating").collect()

998 ms ± 46.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
%%timeit
_=benchmark2.select("star_rating").groupby("star_rating").count().collect()

798 ms ± 52.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
