# Import libs

In [1]:
# %load ../../templates/load_libs.py
import sys
from pyspark.ml.classification import LogisticRegression, NaiveBayes, DecisionTreeClassifier, GBTClassifier, \
    RandomForestClassifier
# Root of the project checkout.  It is placed at the front of the module
# search path (once) so that the shared ``libs`` package can be imported.
PROJECT_DIR = '/home/jovyan/work/amazon-review-validator'
if sys.path.count(PROJECT_DIR) == 0:
    # Prepend so this directory is searched before the existing entries.
    sys.path[:0] = [PROJECT_DIR]
    
from libs.utils import get_distribution_col,fill_na_mean,get_null_counts,get_correlation_target_col_length

# ETL the data

In [24]:
# %load "../../utils/environment.py"
from pyspark.ml.classification import LogisticRegression, NaiveBayes, DecisionTreeClassifier, GBTClassifier, \
    RandomForestClassifier
import pyspark as ps
from pyspark.sql.types import StructField, StructType, StringType, IntegerType
from pyspark.sql.functions import current_date, expr, datediff, to_date, lit, coalesce, length, regexp_replace,count,isnan,when,col

# Path (relative to the notebook) of the gzipped TSV review file that is loaded.
DATA_FILE = '../../data/amazon_reviews_us_Camera_v1_00.tsv.gz'
# Name given to the Spark application.
APP_NAME = 'EDA'
# Columns projected into the ``all`` / ``samples`` views.
# 'helfulness' (sic) must keep this spelling: it matches the name of the
# derived column created during feature engineering.
FEATURES = [
    'star_rating',
    'exclam',
    'helfulness',
    'review_length',
    'verified_purchase',
    'age',
]
# Maximum number of rows kept in the cached sample.
SAMPLE_SIZE = 10_000

# Explicit schema for the review TSV.  Every column is nullable; only
# star_rating, helpful_votes and total_votes are read as integers, everything
# else (including review_date) is read as a string.
review_schema = StructType([
    StructField(field_name, field_type(), True)
    for field_name, field_type in (
        ('marketplace', StringType),
        ('customer_id', StringType),
        ('review_id', StringType),
        ('product_id', StringType),
        ('product_parent', StringType),
        ('product_title', StringType),
        ('product_category', StringType),
        ('star_rating', IntegerType),
        ('helpful_votes', IntegerType),
        ('total_votes', IntegerType),
        ('vine', StringType),
        ('verified_purchase', StringType),
        ('review_headline', StringType),
        ('review_body', StringType),
        ('review_date', StringType),
    )
])

# Get (or reuse) a local Spark session for this notebook; the "local[1]"
# master string and the application name are passed to the builder as-is.
spark = ps.sql.SparkSession.builder \
    .master("local[1]").appName(APP_NAME).getOrCreate()
# The session's underlying SparkContext.
sc = spark.sparkContext

# Load the tab-separated review file (first line is a header) using the
# explicit schema instead of letting Spark infer column types.
df = (
    spark.read
    .format("csv")
    .options(header="true", sep="\t")
    .schema(review_schema)
    .load(DATA_FILE)
)

# --- Cleaning -------------------------------------------------------------
# Rows without a star rating are dropped; a missing review body becomes an
# empty string so that its length is 0 rather than null.
df = df.na.drop(subset=["star_rating"])
df = df.fillna('', subset=['review_body'])

# --- Text-length features ---------------------------------------------------
df = df.withColumn('review_length', length('review_body'))
df = df.withColumn('review_headline_length', length('review_headline'))
df = df.withColumn('product_title_length', length('product_title'))

# --- Derived features -------------------------------------------------------
# Number of '!' characters in the body: length of the body minus its length
# after every '!' has been removed.  The pattern is a raw string because
# '\!' in a normal literal is an invalid escape sequence (DeprecationWarning,
# SyntaxWarning from Python 3.12); the regex handed to Spark is unchanged.
df = df.withColumn('exclam', df['review_length'] - length(regexp_replace('review_body', r'\!', '')))
# 1 when the rating is above 3 stars, otherwise 0.
df = df.withColumn('positive', (df['star_rating'] > 3).cast('integer'))
# Days between the review date and the day the notebook is run, so the
# values shift with the run date.
df = df.withColumn('age', datediff(current_date(), to_date(df['review_date'])))
# Fraction of votes that were "helpful"; a null ratio is replaced by 0.0.
# The column name is misspelled ('helfulness') but FEATURES uses the same
# spelling, so it must not be corrected here alone.
df = df.withColumn('helfulness', coalesce(df['helpful_votes'] / df['total_votes'], lit(0.0)))
# Integer flags derived from the 'Y'/other string columns.  Note that 'vine'
# replaces the original string column of the same name.
df = df.withColumn('label', expr("CAST(verified_purchase='Y' As INT)"))
df = df.withColumn('vine', expr("CAST(vine='Y' As INT)"))
# Project helper from libs.utils; presumably fills null ages with the column
# mean -- confirm against its implementation.
df = fill_na_mean(df, 'age')

# Feature-only projection of the full data set.
review_all = df.select(FEATURES)
# At most SAMPLE_SIZE rows of the same projection, cached for repeated use.
review_sample = review_all.limit(SAMPLE_SIZE).cache()

# Register the three frames as temporary views so they can be queried with
# spark.sql under the names "df", "all" and "samples".
df.createOrReplaceTempView("df")
review_all.createOrReplaceTempView("all")
review_sample.createOrReplaceTempView("samples")

# EDA the data
## Count nulls in each column

In [8]:
# Report the number of nulls per column of the full frame (helper from
# libs.utils; per the recorded output it prints a one-row table of counts).
get_null_counts(df)

+-----------+-----------+---------+----------+--------------+-------------+----------------+-----------+-------------+-----------+----+-----------------+---------------+-----------+-----------+-------------+----------------------+--------------------+------+--------+---+----------+-----+
|marketplace|customer_id|review_id|product_id|product_parent|product_title|product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|review_headline|review_body|review_date|review_length|review_headline_length|product_title_length|exclam|positive|age|helfulness|label|
+-----------+-----------+---------+----------+--------------+-------------+----------------+-----------+-------------+-----------+----+-----------------+---------------+-----------+-----------+-------------+----------------------+--------------------+------+--------+---+----------+-----+
|          0|          0|        0|         0|             0|            0|               0|          0|            0|          0|   

## list each review with its number of previous reviews, current average rating, and age of the review

In [2]:
# Running review count per product, accumulated in ascending `age` order.
# NOTE(review): `age` is the number of days since the review date, so
# ascending order walks from the newest review to the oldest.  If the count
# is meant to cover reviews written *before* each review, the window
# probably needs `order by age desc` -- confirm the intent.
spark.sql('''select review_id, product_id, age, star_rating, sum(1) over (partition by product_id order by age) as accum_review_count from df''').show()

NameError: name 'spark' is not defined

In [6]:
# Running average star rating per product, accumulated in ascending `age`
# order; rows of one product sharing the same age get the same value (see
# the recorded output).
# NOTE(review): `age` is the number of days since the review date, so
# ascending order walks from the newest review to the oldest.  If this is
# meant to be the average rating *at the time of* each review, the window
# probably needs `order by age desc` -- confirm the intent.
spark.sql('''select review_id, product_id, age, star_rating, avg(star_rating) over (partition by product_id order by age) as accum_rating from df''').show()

+--------------+----------+----+-----------+------------------+
|     review_id|product_id| age|star_rating|      accum_rating|
+--------------+----------+----+-----------+------------------+
|R1O6Z1DDT2P1XI|B00000J47G|1809|          5|               5.0|
|R1JG9M7Z29JZ8A|B00000J47G|1815|          5|               5.0|
|R1JJGHPWABDFXV|B00000J47G|4786|          5|               5.0|
|R2RMYNWCP79YVQ|B00000J47G|4788|          1|               4.0|
|R2TBZTLKU11VZU|B00000J47G|5077|          5|               4.2|
|R14B3XKWH0WQDL|B00000J47G|5799|          3|               4.0|
| R9EVQ9YXO8N5V|B00000J47G|5815|          1|3.5714285714285716|
| RJTDPXIPVVQIA|B00000J47G|5918|          1|              3.25|
|R1K9NN4IK0S7NT|B00000J47G|6130|          4|3.3333333333333335|
| R36SG72DYDIQL|B00000J47G|6414|          4|               3.4|
| R5OS4MY8RBGZG|B00000J47G|6415|          5|3.6666666666666665|
|R3LVY8TXNJ733Q|B00000J47G|6415|          5|3.6666666666666665|
|R215WFCN9MNL6G|B00000J47G|6433|        

## show distribution of 'verified_purchase','helpful_votes','total_votes','vine' columns in star_rating

In [3]:
# Call the libs.utils helper once per listed column, pairing it with
# 'star_rating'.  Per the heading this shows each column's distribution
# across star ratings -- helper semantics are not visible here, confirm in
# libs.utils.
for column in ['verified_purchase','helpful_votes','total_votes','vine']:
    get_distribution_col('star_rating', column)

NameError: name 'get_distribution_col' is not defined

## show correlation between the targets 'star_rating','verified_purchase' and the length of the 'review_body','review_headline','product_title' columns

In [22]:
# For every (target, text column) pair, call the libs.utils helper that
# relates the target to the length of that text column in the "df" view.
# NOTE(review): the recorded run failed with "got multiple values for
# argument 'target'", which suggests the helper's first positional parameter
# is `target` rather than the Spark session.  Check the signature in
# libs.utils and pass `spark` in the position/keyword it actually expects.
for target in ['star_rating', 'verified_purchase']:
    # The loop variable is deliberately not called `col`: at notebook scope
    # that would rebind the `col` function imported from
    # pyspark.sql.functions and break any later cell that uses it.
    for text_col in ['review_body', 'review_headline', 'product_title']:
        get_correlation_target_col_length(spark, target=target, col=text_col, table='df')

TypeError: get_correlation_target_col_length() got multiple values for argument 'target'

In [31]:
# Top five customers by number of reviews.
spark.sql("select customer_id, count(*) as counts from df group by customer_id order by counts desc").show(5)

+-----------+------+
|customer_id|counts|
+-----------+------+
|   31588426|   285|
|   50820654|   191|
|   52764559|   171|
|   44777060|   148|
|   52340667|   146|
+-----------+------+
only showing top 5 rows



In [30]:
# Number of reviews per star rating.  The query has no ORDER BY, so the
# order of the printed rows is not guaranteed.
spark.sql("select star_rating, count(*) as counts from df group by star_rating").show(5)

+-----------+-------+
|star_rating| counts|
+-----------+-------+
|          1| 170157|
|          3| 141460|
|          5|1062706|
|          4| 336700|
|          2|  90949|
+-----------+-------+



In [32]:
# Top five products by number of reviews.
spark.sql("select product_id, count(*) as counts from df group by product_id order by counts desc").show(5)

+----------+------+
|product_id|counts|
+----------+------+
|B006ZP8UOW|  4654|
|B00007E7JU|  4399|
|B0039BPG1A|  3619|
|B002VPE1WK|  3565|
|B0050R67U0|  3177|
+----------+------+
only showing top 5 rows



In [33]:
# Top five products by number of reviews (same query as the previous cell,
# run again).
spark.sql("select product_id, count(*) as counts from df group by product_id order by counts desc").show(5)


+----------+------+
|product_id|counts|
+----------+------+
|B006ZP8UOW|  4654|
|B00007E7JU|  4399|
|B0039BPG1A|  3619|
|B002VPE1WK|  3565|
|B0050R67U0|  3177|
+----------+------+
only showing top 5 rows

