# Text processing and sentiment analysis of Amazon reviews 
* Automotive category analysis

### Importing libs

In [1]:
import os
from dotenv import load_dotenv
from pyspark.sql import SparkSession
from delta import *
import pyspark.sql.functions as F

In [2]:
load_dotenv()

STORAGE_ACCOUNT_NAME = os.getenv('STORAGE_ACCOUNT_NAME')
STORAGE_ACCOUNT_KEY = os.getenv('STORAGE_ACCOUNT_KEY')

builder = SparkSession.builder\
        .config("spark.jars.packages", "org.apache.hadoop:hadoop-azure:3.3.4,\
io.delta:delta-core_2.12:2.4.0,\
io.delta:delta-storage:2.4.0")\
        .config(f"fs.azure.account.key.{STORAGE_ACCOUNT_NAME}.dfs.core.windows.net", f"{STORAGE_ACCOUNT_KEY}")\
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")\
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

spark = configure_spark_with_delta_pip(builder).getOrCreate()



In [32]:
root = 'abfss://default@stdatalakeakita.dfs.core.windows.net/synapse/workspaces/syn-synfactoreddatathon01-dev'

df = spark.read\
.format('delta')\
.load(f'{root}/silver/silver_amazon_reviews_automotive')

df.show()

+--------------+----------+--------------------+------+-------+--------------------+--------------------+--------------------+--------------------+--------------+-----------------+------------+------------+--------------------+------------+---------+-------------+
|   reviewer_id|      asin|               title|  rank|overall|flg_positive_overall|flg_negative_overall|             summary|         review_text|         brand|     2nd_category|3rd_category|4th_category|            also_buy|also_buy_qty|also_view|also_view_qty|
+--------------+----------+--------------------+------+-------+--------------------+--------------------+--------------------+--------------------+--------------+-----------------+------------+------------+--------------------+------------+---------+-------------+
|A2TYWZQNOGX2YS|B0001EVUCM|Auto Ventshade 77...|364973|      5|                true|               false|          Vent Shade|Did great for me....|Auto Ventshade|Replacement Parts| Body & Trim|        Body

### Pre-processing review text

#### 1. Removing ponctuation

In [34]:
def clean_text(c):
  c = F.lower(c)
  c = F.regexp_replace(c, "(https?\://)\S+", "") # Remove links
  c = F.regexp_replace(c, "(\\n)|\n|\r|\t", "") # Remove CR, tab, and LR
  c = F.regexp_replace(c, "(?:(?:[0-9]{2}[:\/,]){2}[0-9]{2,4})", "") # Remove dates
  c = F.regexp_replace(c, "@([A-Za-z0-9_]+)", "") # Remove usernames
  c = F.regexp_replace(c, "[0-9]", "") # Remove numbers
  c = F.regexp_replace(c, "\:|\/|\#|\.|\?|\!|\&|\"|\,", "") # Remove symbols
  return c

df = df.withColumn("review_text_process", clean_text(F.col("review_text")))
df = df.persist()
df.select('review_text', 'review_text_process').show(truncate=False)

In [19]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover
df_reviews_automotive = (
    df
    .withColumn('review_text', F.coalesce(F.col('review_text'), F.lit('Null Review')))                    
)
tokenizer = Tokenizer(inputCol="review_text", outputCol="review_text_tokenized")
df_reviews_automotive = tokenizer.transform(df_reviews_automotive)
df_reviews_automotive.show()

+--------------+----------+--------------------+-----+-------+--------------------+--------------------+--------------------+--------------------+----------------+--------------------+------------+---------------+--------+------------+---------+-------------+---------------------+
|   reviewer_id|      asin|               title| rank|overall|flg_positive_overall|flg_negative_overall|             summary|         review_text|           brand|        2nd_category|3rd_category|   4th_category|also_buy|also_buy_qty|also_view|also_view_qty|review_text_tokenized|
+--------------+----------+--------------------+-----+-------+--------------------+--------------------+--------------------+--------------------+----------------+--------------------+------------+---------------+--------+------------+---------+-------------+---------------------+
|  AGRRGDPFDU53|B00009WC2N|Bulldog RS82-I Do...|19503|      1|               false|                true|worked for 2 days...|4 hour install ti...|Bulldog 

### Removing Stop Words

In [23]:
from pyspark.ml.feature import StopWordsRemover
remover = StopWordsRemover()
remover.setInputCol("review_text_tokenized")
remover.setOutputCol("words")
df_reviews_automotive = remover.transform(df_reviews_automotive)
df_reviews_automotive.show()

+--------------+----------+--------------------+-----+-------+--------------------+--------------------+--------------------+--------------------+----------------+--------------------+------------+---------------+--------+------------+---------+-------------+---------------------+--------------------+
|   reviewer_id|      asin|               title| rank|overall|flg_positive_overall|flg_negative_overall|             summary|         review_text|           brand|        2nd_category|3rd_category|   4th_category|also_buy|also_buy_qty|also_view|also_view_qty|review_text_tokenized|               words|
+--------------+----------+--------------------+-----+-------+--------------------+--------------------+--------------------+--------------------+----------------+--------------------+------------+---------------+--------+------------+---------+-------------+---------------------+--------------------+
|  AGRRGDPFDU53|B00009WC2N|Bulldog RS82-I Do...|19503|      1|               false|        

In [20]:
df_reviews_automotive.select(F.explode('review_text_tokenized')).distinct().count()

925852

In [26]:
df_reviews_automotive.select(F.explode('words')).distinct().count()

925671

In [31]:
hashingTF = HashingTF(inputCol="words", outputCol="raw_features", numFeatures=925671)
df_reviews_automotiveTF = hashingTF.transform(df_reviews_automotive)

idf = IDF(inputCol="raw_features", outputCol="features")
idfModel = idf.fit(df_reviews_automotiveTF)
df_reviews_automotiveIDF = idfModel.transform(df_reviews_automotiveTF)

df_reviews_automotiveIDF.select("flg_positive_overall", "words", "raw_features", "features").show(truncate=False)

+--------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------