# Import libs

In [15]:
# %load ../../templates/load_libs.py
import sys
from pyspark.ml.classification import LogisticRegression, NaiveBayes, DecisionTreeClassifier, GBTClassifier, \
    RandomForestClassifier
# set project directory for shared library
PROJECT_DIR='/home/jovyan/work/amazon-review-validator'
if PROJECT_DIR not in sys.path:
    sys.path.insert(0, PROJECT_DIR)
    
from libs.utils import fill_na_mean,get_null_counts,get_correlation_target_col_length, get_distribution_col,get_unique_values

# ETL the data

In [11]:
# %load "../../utils/environment.py"
from pyspark.ml.classification import LogisticRegression, NaiveBayes, DecisionTreeClassifier, GBTClassifier, \
    RandomForestClassifier
import pyspark as ps
from pyspark.sql.types import StructField, StructType, StringType, IntegerType
from pyspark.sql.functions import current_date, expr, datediff, to_date, lit, coalesce, length, regexp_replace,count,isnan,when,col

DATA_FILE = '../../data/amazon_reviews_us_Camera_v1_00.tsv.gz'
APP_NAME = 'EDA'
FEATURES = ['star_rating', 'exclam', 'helfulness', 'review_length', 'verified_purchase', 'age']
SAMPLE_SIZE = 10000

review_schema = StructType(
    [StructField('marketplace', StringType(), True),
     StructField('customer_id', StringType(), True),
     StructField('review_id', StringType(), True),
     StructField('product_id', StringType(), True),
     StructField('product_parent', StringType(), True),
     StructField('product_title', StringType(), True),
     StructField('product_category', StringType(), True),
     StructField('star_rating', IntegerType(), True),
     StructField('helpful_votes', IntegerType(), True),
     StructField('total_votes', IntegerType(), True),
     StructField('vine', StringType(), True),
     StructField('verified_purchase', StringType(), True),
     StructField('review_headline', StringType(), True),
     StructField('review_body', StringType(), True),
     StructField('review_date', StringType(), True)])

spark = (ps.sql.SparkSession.builder
         .master("local[1]")
         .appName(APP_NAME)
         .getOrCreate()
         )
sc = spark.sparkContext

df = spark.read.format("csv") \
    .option("header", "true") \
    .option("sep", "\t") \
    .schema(review_schema) \
    .load(DATA_FILE)

df = df.na.drop(subset=["star_rating"])
df = df.fillna('',subset=['review_body'])

df = df.withColumn('review_length', length('review_body'))
df = df.withColumn('review_headline_length', length('review_headline'))
df = df.withColumn('product_title_length', length('product_title'))

df = df.withColumn('exclam',  df['review_length'] - length(regexp_replace('review_body', '\!', '')))
df = df.withColumn('positive',  (df['star_rating']>3).cast('integer'))
df = df.withColumn('age', datediff(current_date(), to_date(df['review_date'])))
df = df.drop('review_date')
df = df.withColumn('helfulness', coalesce(df['helpful_votes'] / df['total_votes'],lit(0.0)))
df = df.withColumn('verified_purchase', expr("CAST(verified_purchase='Y' As INT)"))
df = df.withColumn('vine', expr("CAST(vine='Y' As INT)"))
df = fill_na_mean(df,'age')

review_all = df.select(FEATURES)
review_sample = df.select(FEATURES).limit(SAMPLE_SIZE).cache()

review_sample.createOrReplaceTempView("samples")
review_all.createOrReplaceTempView("all")
df.createOrReplaceTempView("df")

# EDA the data
## Count nulls in each column

In [16]:
get_unique_values(df)

marketplace: 1 unique values
customer_id: 1116761 unique values
review_id: 1801972 unique values
product_id: 168673 unique values
product_parent: 153455 unique values
product_title: 154843 unique values
product_category: 1 unique values
star_rating: 5 unique values
helpful_votes: 877 unique values
total_votes: 895 unique values
vine: 2 unique values
verified_purchase: 2 unique values
review_headline: 1015325 unique values
review_body: 1696853 unique values
review_length: 9046 unique values
review_headline_length: 177 unique values
product_title_length: 400 unique values
exclam: 88 unique values
positive: 2 unique values
age: 5858 unique values
helfulness: 5079 unique values


In [13]:
get_null_counts(df)

+-----------+-----------+---------+----------+--------------+-------------+----------------+-----------+-------------+-----------+----+-----------------+---------------+-----------+-------------+----------------------+--------------------+------+--------+---+----------+
|marketplace|customer_id|review_id|product_id|product_parent|product_title|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|review_headline|review_body|review_length|review_headline_length|product_title_length|exclam|positive|age|helfulness|
+-----------+-----------+---------+----------+--------------+-------------+----------------+-----------+-------------+-----------+----+-----------------+---------------+-----------+-------------+----------------------+--------------------+------+--------+---+----------+
|          0|          0|        0|         0|             0|            0|               0|          0|            0|          0|   0|                0|              0|          0|      

## list each review with its number of previous reviews, current average rating, and age of the review

In [10]:
spark.sql('''select review_id, product_id, age, star_rating, avg(star_rating) over (partition by product_id order by age) as mean_rating , sum(1) over (partition by product_id order by age) as current_review_count from df''').show()

+--------------+----------+----+-----------+------------------+--------------------+
|     review_id|product_id| age|star_rating|       mean_rating|current_review_count|
+--------------+----------+----+-----------+------------------+--------------------+
|R1O6Z1DDT2P1XI|B00000J47G|1810|          5|               5.0|                   1|
|R1JG9M7Z29JZ8A|B00000J47G|1816|          5|               5.0|                   2|
|R1JJGHPWABDFXV|B00000J47G|4787|          5|               5.0|                   3|
|R2RMYNWCP79YVQ|B00000J47G|4789|          1|               4.0|                   4|
|R2TBZTLKU11VZU|B00000J47G|5078|          5|               4.2|                   5|
|R14B3XKWH0WQDL|B00000J47G|5800|          3|               4.0|                   6|
| R9EVQ9YXO8N5V|B00000J47G|5816|          1|3.5714285714285716|                   7|
| RJTDPXIPVVQIA|B00000J47G|5919|          1|              3.25|                   8|
|R1K9NN4IK0S7NT|B00000J47G|6131|          4|3.3333333333333335|  

## show distribution of 'verified_purchase','helpful_votes','total_votes','vine' columns in star_rating

In [5]:
for column in ['verified_purchase','helpful_votes','total_votes','vine']:
    get_distribution_col(spark, 'star_rating', column)

+-----------+-----------------+
|star_rating|verified_purchase|
+-----------+-----------------+
|          1|           126789|
|          2|            71002|
|          3|           115780|
|          4|           276470|
|          5|           904360|
+-----------+-----------------+

+-----------+-------------+
|star_rating|helpful_votes|
+-----------+-------------+
|          1|       599024|
|          2|       293612|
|          3|       489380|
|          4|      1061154|
|          5|      2792305|
+-----------+-------------+

+-----------+-----------+
|star_rating|total_votes|
+-----------+-----------+
|          1|    1004936|
|          2|     445545|
|          3|     662408|
|          4|    1234586|
|          5|    3208903|
+-----------+-----------+

+-----------+----+
|star_rating|vine|
+-----------+----+
|          1| 143|
|          2| 357|
|          3|1139|
|          4|2951|
|          5|3293|
+-----------+----+



## show distribution of 'verified_purchase','helpful_votes','total_votes','vine' columns in star_rating

In [6]:
for target in ['star_rating','verified_purchase']:
    for col in ['review_body','review_headline','product_title']:
        get_correlation_target_col_length(spark, target=target,col=col, table='df')

+-----------+------------------------+
|star_rating|avg(length(review_body))|
+-----------+------------------------+
|          1|      431.19363881591704|
|          2|       530.5410944595322|
|          3|        523.108673830058|
|          4|        522.320591030591|
|          5|      363.85308166134377|
+-----------+------------------------+

+-----------+----------------------------+
|star_rating|avg(length(review_headline))|
+-----------+----------------------------+
|          1|          27.045951679919135|
|          2|          28.798051655323313|
|          3|          28.525484235826383|
|          4|           25.77995247995248|
|          5|           22.91408348122623|
+-----------+----------------------------+

+-----------+--------------------------+
|star_rating|avg(length(product_title))|
+-----------+--------------------------+
|          1|         88.82231703661913|
|          2|          87.6084728804055|
|          3|         87.14099392054291|
|          4| 

In [31]:
spark.sql("select customer_id, count(*) as counts from df group by customer_id order by counts desc").show(5)

+-----------+------+
|customer_id|counts|
+-----------+------+
|   31588426|   285|
|   50820654|   191|
|   52764559|   171|
|   44777060|   148|
|   52340667|   146|
+-----------+------+
only showing top 5 rows



In [30]:
spark.sql("select star_rating, count(*) as counts from df group by star_rating").show(5)

+-----------+-------+
|star_rating| counts|
+-----------+-------+
|          1| 170157|
|          3| 141460|
|          5|1062706|
|          4| 336700|
|          2|  90949|
+-----------+-------+



In [32]:
spark.sql("select product_id, count(*) as counts from df group by product_id order by counts desc").show(5)

+----------+------+
|product_id|counts|
+----------+------+
|B006ZP8UOW|  4654|
|B00007E7JU|  4399|
|B0039BPG1A|  3619|
|B002VPE1WK|  3565|
|B0050R67U0|  3177|
+----------+------+
only showing top 5 rows



In [33]:
spark.sql("select product_id, count(*) as counts from df group by product_id order by counts desc").show(5)


+----------+------+
|product_id|counts|
+----------+------+
|B006ZP8UOW|  4654|
|B00007E7JU|  4399|
|B0039BPG1A|  3619|
|B002VPE1WK|  3565|
|B0050R67U0|  3177|
+----------+------+
only showing top 5 rows

