In [16]:
# %load "../../utils/environment.py"
from pyspark.ml.classification import LogisticRegression, NaiveBayes, DecisionTreeClassifier, GBTClassifier, \
    RandomForestClassifier
import pyspark as ps
from pyspark.sql.types import StructField, StructType, StringType, IntegerType

# --- Notebook configuration --------------------------------------------------
# Path (relative to this notebook) of the gzipped review file that gets loaded.
DATA_FILE = '../../data/amazon_reviews_us_Camera_v1_00.tsv.gz'

# Application name handed to the SparkSession builder.
APP_NAME = 'EDA'

# Columns projected out of the full review table for the analysis frames.
FEATURES = [
    'star_rating',
    'review_body',
    'helpful_votes',
    'total_votes',
    'verified_purchase',
    'review_date',
]

# Row cap applied when building the small cached working frame.
SAMPLE_SIZE = 10000

# Column layout of the review file, in file order, as (name, Spark type) pairs.
# Only the rating and the two vote counters are read as integers; every other
# column (including the ids and review_date) is read as a plain string.
_REVIEW_COLUMNS = (
    ('marketplace', StringType),
    ('customer_id', StringType),
    ('review_id', StringType),
    ('product_id', StringType),
    ('product_parent', StringType),
    ('product_title', StringType),
    ('product_category', StringType),
    ('star_rating', IntegerType),
    ('helpful_votes', IntegerType),
    ('total_votes', IntegerType),
    ('vine', StringType),
    ('verified_purchase', StringType),
    ('review_headline', StringType),
    ('review_body', StringType),
    ('review_date', StringType),
)

# Explicit schema passed to the reader; the trailing True marks each field as
# nullable.
review_schema = StructType(
    [StructField(col_name, col_type(), True) for col_name, col_type in _REVIEW_COLUMNS]
)

# Configure a local Spark session ("local[1]" = run locally with a single
# worker thread) named after APP_NAME.
session_builder = ps.sql.SparkSession.builder.master("local[1]").appName(APP_NAME)

# getOrCreate() hands back an existing session if one is already running,
# otherwise it starts a new one from the builder settings above.
spark = session_builder.getOrCreate()

# Keep a handle on the session's underlying SparkContext as well.
sc = spark.sparkContext

# Reader for the tab-separated review file: the first line is treated as a
# header, and column types come from review_schema.
tsv_reader = (
    spark.read.format("csv")
    .option("header", "true")
    .option("sep", "\t")
    .schema(review_schema)
)
df = tsv_reader.load(DATA_FILE)

# Expose the full table to spark.sql() queries under the name "dfTable".
df.createOrReplaceTempView("dfTable")

# Every row, restricted to the FEATURES columns.
review_all = df.select(FEATURES)

# Small cached working frame with the same columns, capped at SAMPLE_SIZE rows.
# NOTE(review): limit() only caps the row count; this is not a random sample
# (no sample()/seed is involved) -- confirm that is what downstream cells expect.
review_sample = review_all.limit(SAMPLE_SIZE).cache()


In [17]:
# Count reviews per customer_id over the whole table, highest count first.
# There is no tie-breaker in the ORDER BY, so the relative order of customers
# with equal counts is not pinned down by the query.
reviews_per_customer = spark.sql(
    "select customer_id, count(*) as counts from dfTable group by customer_id order by counts desc"
)

# show() with no arguments prints only the top 20 rows.
reviews_per_customer.show()

+-----------+------+
|customer_id|counts|
+-----------+------+
|   31588426|   285|
|   50820654|   191|
|   52764559|   171|
|   44777060|   148|
|   52340667|   146|
|   45664110|   145|
|    9115336|   140|
|   53090839|   130|
|   52859210|   129|
|   27140716|   126|
|   24550970|   126|
|   45371561|   126|
|   40109303|   123|
|   51865210|   120|
|   38681283|   114|
|   22401847|   112|
|   52770861|   107|
|   48640632|   104|
|   52765756|   103|
|   50906184|   102|
+-----------+------+
only showing top 20 rows



In [18]:
%%time
spark.sql("select star_rating, count(*) as counts from dfTable group by star_rating").show()

+-----------+-------+
|star_rating| counts|
+-----------+-------+
|       null|      2|
|          1| 170157|
|          3| 141460|
|          5|1062706|
|          4| 336700|
|          2|  90949|
+-----------+-------+

CPU times: user 4.29 ms, sys: 602 µs, total: 4.89 ms
Wall time: 16 s
