In [3]:
# %load ../../templates/load_libs.py
import sys
from pyspark.ml.classification import LogisticRegression, NaiveBayes, DecisionTreeClassifier, GBTClassifier, \
    RandomForestClassifier
# set project directory for shared library
PROJECT_DIR='/home/jovyan/work/amazon-review-validator'
if PROJECT_DIR not in sys.path:
    sys.path.insert(0, PROJECT_DIR)
    
from libs.utils import hello
hello()

hello works


In [7]:
# %load ../../templates/load_data.py
import pyspark as ps
from pyspark.sql.types import StructField, StructType, StringType, IntegerType

DATA_FILE = '../../data/amazon_reviews_us_Camera_v1_00.tsv.gz'
APP_NAME = 'EDA'
FEATURES = ['star_rating', 'review_body', 'helpful_votes', 'total_votes', 'verified_purchase', 'review_date']
SAMPLE_SIZE = 10000

review_schema = StructType(
    [StructField('marketplace', StringType(), True),
     StructField('customer_id', StringType(), True),
     StructField('review_id', StringType(), True),
     StructField('product_id', StringType(), True),
     StructField('product_parent', StringType(), True),
     StructField('product_title', StringType(), True),
     StructField('product_category', StringType(), True),
     StructField('star_rating', IntegerType(), True),
     StructField('helpful_votes', IntegerType(), True),
     StructField('total_votes', IntegerType(), True),
     StructField('vine', StringType(), True),
     StructField('verified_purchase', StringType(), True),
     StructField('review_headline', StringType(), True),
     StructField('review_body', StringType(), True),
     StructField('review_date', StringType(), True)])

spark = (ps.sql.SparkSession.builder
         .master("local[1]")
         .appName(APP_NAME)
         .getOrCreate()
         )
sc = spark.sparkContext

df = spark.read.format("csv") \
    .option("header", "true") \
    .option("sep", "\t") \
    .schema(review_schema) \
    .load(DATA_FILE)
df.createOrReplaceTempView("dfTable")

review_all = df.select(FEATURES)
review_sample = df.select(FEATURES).limit(SAMPLE_SIZE).cache()


In [26]:
spark.sql(
        "select customer_id, count(if (star_rating==1,1,NULL)) as one_star, count(if (star_rating==2,1,NULL)) as two_star, count(if (star_rating==3,1,NULL)) as tre_star, count(if (star_rating==4,1,NULL)) as qua_star, count(if (star_rating==5,1,NULL)) as cin_star from eda_sql_view group by customer_id order by one_star desc ").show()

+-----------+--------+--------+--------+--------+--------+
|customer_id|one_star|two_star|tre_star|qua_star|cin_star|
+-----------+--------+--------+--------+--------+--------+
|   11298115|      28|       2|       0|       0|       2|
|   12572844|      26|       0|       0|       0|       0|
|   49010601|      24|       0|       0|       0|       0|
|   47129351|      24|       1|       0|       0|       0|
|   35955273|      21|       0|       5|       1|      45|
|   25109130|      20|       0|       0|       0|       3|
|   10430957|      19|       0|       0|       0|       0|
|   50350415|      19|       8|       5|      18|      27|
|   45420831|      17|       0|       0|       0|       0|
|   36814878|      16|       2|       5|       2|      19|
|   52614016|      16|      15|       7|       8|      23|
|   12761652|      16|       0|       0|       0|       0|
|    8199747|      16|       1|       2|       1|      16|
|   13027341|      14|       3|       0|       0|       

In [27]:
spark.sql(
        "select customer_id, count(if (star_rating==1,1,NULL)) as one_star, count(if (star_rating==2,1,NULL)) as two_star, count(if (star_rating==3,1,NULL)) as tre_star, count(if (star_rating==4,1,NULL)) as qua_star, count(if (star_rating==5,1,NULL)) as cin_star from eda_sql_view group by customer_id order by cin_star desc ").show()

+-----------+--------+--------+--------+--------+--------+
|customer_id|one_star|two_star|tre_star|qua_star|cin_star|
+-----------+--------+--------+--------+--------+--------+
|   31588426|       4|       2|       7|      59|     213|
|    9115336|       0|       0|       1|      17|     122|
|   45371561|       0|       1|       3|       6|     116|
|   38681283|       0|       1|       3|       6|     104|
|   52340667|      11|       7|       9|      16|     103|
|   40109303|       0|       0|       0|      20|     103|
|   50820654|       1|       2|      15|      70|     103|
|   52859210|       9|       2|      10|      13|      95|
|   52764559|       9|      10|      12|      49|      91|
|   48640632|       0|       0|       4|      10|      90|
|   44777060|       1|       1|      11|      45|      90|
|   16255502|       1|       0|       2|       1|      89|
|    2840168|       0|       0|       1|       0|      89|
|   53017806|       0|       0|       0|       6|      8

In [28]:
spark.sql(
    "SELECT  (COUNT(IF (verified_purchase == 'Y', 1, NULL))/COUNT(*)) as percentage_verified_purchase FROM eda_sql_view").show()

+----------------------------+
|percentage_verified_purchase|
+----------------------------+
|          0.8293132975281552|
+----------------------------+



In [29]:
spark.sql(
    "SELECT  (COUNT(IF (star_rating >3, 1, NULL))/COUNT(*)) as percentage_star_rating FROM eda_sql_view").show()

+----------------------+
|percentage_star_rating|
+----------------------+
|    0.7765961107096995|
+----------------------+



In [30]:
spark.sql(
    "select verified_purchase, count(verified_purchase) as counts from EDA_sql_view group by verified_purchase order by counts desc ").show()

+-----------------+-------+
|verified_purchase| counts|
+-----------------+-------+
|                Y|1494401|
|                N| 307571|
|             null|      0|
+-----------------+-------+



In [31]:
spark.sql(
    "select star_rating, count(star_rating) as counts from EDA_sql_view group by star_rating order by counts desc ").show()

+-----------+-------+
|star_rating| counts|
+-----------+-------+
|          5|1062706|
|          4| 336700|
|          1| 170157|
|          3| 141460|
|          2|  90949|
|       null|      0|
+-----------+-------+



In [2]:
spark.sql("select avg(helpful_votes) as average, min(helpful_votes) as min, max(helpful_votes) as max from eda_sql_view").show()


+-----------------+---+----+
|          average|min| max|
+-----------------+---+----+
|2.905414179576597|  0|5132|
+-----------------+---+----+



In [3]:
spark.sql("select avg(review_body) as average, min(review_body) as min, max(review_body) as max from eda_sql_view").show()

+------------------+---+------+
|           average|min|   max|
+------------------+---+------+
|16.515151515151516| |🚘🎥👍|
+------------------+---+------+



In [4]:
spark.sql(
    "select avg(length(review_body)) as average, min(length(review_body)) as min, max(length(review_body)) as max from eda_sql_view").show()

+-----------------+---+-----+
|          average|min|  max|
+-----------------+---+-----+
|420.7652951133557|  1|48929|
+-----------------+---+-----+



In [16]:
len(df.columns)

15

In [14]:
spark.sql(
   "Select count( distinct product_id) from eda_sql_view").show()

+--------------------------+
|count(DISTINCT product_id)|
+--------------------------+
|                    168675|
+--------------------------+

