# Exploring Amazon Reviews Dataset

## Importing Libs

In [1]:
import pyspark.pandas as ps
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
import pyspark.sql.functions as F
from pyspark.sql.types import StructType,StructField, StringType

StatementMeta(synsp03, 18, 2, Finished, Available)

'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.


## Selecting only reviews for automotive products

In [2]:
# Base location of the Synapse workspace inside the data lake.
root = 'abfss://default@stdatalakeakita.dfs.core.windows.net/synapse/workspaces/syn-synfactoreddatathon01-dev'

# Silver-layer review data, redistributed over 2000 partitions.
df = (
    spark.read
    .parquet(f'{root}/silver/silver_amazon_reviews/**')
    .repartition(2000)
)

# Silver-layer metadata for automotive products, redistributed over 200 partitions.
df_metadata_automotive = (
    spark.read
    .parquet(f'{root}/silver/silver_amazon_metadata_automotive/**')
    .repartition(200)
)

# The inner join on `asin` keeps only reviews whose product appears in the
# automotive metadata, and attaches the metadata columns to each review.
df_reviews_automotive = df.join(df_metadata_automotive, ['asin'], 'inner')

# pandas-on-Spark views of both frames for pandas-style exploration.
df_pandas = df.pandas_api()
df_pandas_reviews_automotive = df_reviews_automotive.pandas_api()

StatementMeta(synsp03, 18, 3, Finished, Available)

### This dataset originally has ~138MM rows, ~3.5MM after filtering to the automotive category

In [3]:
# Row/column counts followed by the column index of the full reviews frame.
print(df_pandas.shape, df_pandas.columns, sep='\n')

StatementMeta(synsp03, 13, 4, Finished, Available)

(138482350, 12)
Index(['asin', 'review_timestamp', 'overall', 'review_text', 'reviewer_id',
       'reviewer_name', 'summary', 'verified', 'vote', 'style_format',
       'style_color', 'style_size'],
      dtype='object')


In [8]:
# Row/column counts followed by the column index of the automotive-only frame.
print(
    df_pandas_reviews_automotive.shape,
    df_pandas_reviews_automotive.columns,
    sep='\n',
)

StatementMeta(synsp03, 13, 9, Finished, Available)

(3478964, 26)
Index(['asin', 'review_timestamp', 'overall', 'review_text', 'reviewer_id',
       'reviewer_name', 'summary', 'verified', 'vote', 'style_format',
       'style_color', 'style_size', 'title', 'brand', 'rank', 'category',
       'main_category', '2nd_category', '3rd_category', '4th_category',
       'description', 'feature', 'also_buy', 'also_buy_qty', 'also_view',
       'also_view_qty'],
      dtype='object')


### Adding positive and negative overall-rating flags to help with sentiment analysis

In [3]:
df_reviews_automotive = (
    df_reviews_automotive
    # A rating of 3 or more is flagged positive; anything that does not match
    # the condition (including a missing rating) falls through to False.
    .withColumn(
        'flg_positive_overall',
        F.when(F.col('overall') >= 3, F.lit(True)).otherwise(F.lit(False)),
    )
    # A rating below 3 is flagged negative, with the same False fallback.
    .withColumn(
        'flg_negative_overall',
        F.when(F.col('overall') < 3, F.lit(True)).otherwise(F.lit(False)),
    )
    # Keep only the columns persisted to the automotive reviews table.
    .select(
        'reviewer_id',
        'asin',
        'title',
        'rank',
        'overall',
        'flg_positive_overall',
        'flg_negative_overall',
        'summary',
        'review_text',
        'brand',
        '2nd_category',
        '3rd_category',
        '4th_category',
        'also_buy',
        'also_buy_qty',
        'also_view',
        'also_view_qty',
    )
)

# Persist the result as a Delta table, replacing whatever is already at the path.
df_reviews_automotive.write.format('delta').mode('overwrite').save(
    f'{root}/silver/silver_amazon_reviews_automotive'
)

StatementMeta(synsp03, 18, 4, Finished, Available)

In [6]:
# Summary statistics for the numeric columns of the joined automotive reviews.
# NOTE: this pandas-on-Spark frame was created in the loading cell, before the
# flag columns and column selection were applied to df_reviews_automotive.
df_pandas_reviews_automotive.describe()

StatementMeta(synsp03, 15, 6, Finished, Available)

Unnamed: 0,rank,overall,also_buy_qty,also_view_qty
count,3462231.0,3478964.0,3478964.0,3478964.0
mean,663659.3,4.226817,4.884604,10.7612
std,1077200.0,1.309845,15.78215,19.36393
min,1.0,1.0,1.0,1.0
25%,37133.0,4.0,1.0,1.0
50%,262381.0,5.0,1.0,1.0
75%,791343.0,5.0,1.0,4.0
max,27090790.0,5.0,100.0,60.0
