# **Using Spark DataFrame**

In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName("Customer_Sentiments").getOrCreate()

In [4]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType
from pyspark.sql.functions import col

In [5]:
# marketplace         STRING,
#   customer_id         STRING,
#   review_id           STRING,
#   product_id          STRING,
#   product_parent      STRING,
#   product_title       STRING,
#   product_category    STRING,
#   star_rating         INT,
#   helpful_votes       INT,
#   total_votes         INT,
#   vine                STRING,
#   verified_purchase   STRING,
#   review_headline     STRING,
#   review_body         STRING,
#   review_date         STRING,
#   sentiment           STRING
customer_sentiments_schema =StructType([
    StructField("marketplace", StringType(), True),
    StructField("customer_id", StringType(), True),
    StructField("review_id", StringType(), True),
    StructField("product_id", StringType(), True),
    StructField("product_parent", StringType(), True),
    StructField("product_title", StringType(), True),
    StructField("product_category", StringType(), True),
    StructField("star_rating", IntegerType(), True),
    StructField("helpful_votes", IntegerType(), True),
    StructField("total_votes", IntegerType(), True),
    StructField("vine", StringType(), True),
    StructField("verified_purchase", StringType(), True),
    StructField("review_headline", StringType(), True),
    StructField("review_body", StringType(), True),
    StructField("review_date", StringType(), True),
    StructField("sentiment", StringType(), True)
])

In [6]:
customer_sentiments = spark.read.option("header", False).schema(customer_sentiments_schema).parquet("customer_reviews_with_sentiment.parquet")

In [7]:
customer_sentiments.printSchema()

root
 |-- marketplace: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_parent: string (nullable = true)
 |-- product_title: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- star_rating: integer (nullable = true)
 |-- helpful_votes: integer (nullable = true)
 |-- total_votes: integer (nullable = true)
 |-- vine: string (nullable = true)
 |-- verified_purchase: string (nullable = true)
 |-- review_headline: string (nullable = true)
 |-- review_body: string (nullable = true)
 |-- review_date: string (nullable = true)
 |-- sentiment: string (nullable = true)



In [8]:
customer_sentiments.show(5, truncate=False)

+-----------+-----------+--------------+----------+--------------+-----------------------------------------------------------------------------------------------------------------------------------------------+----------------+-----------+-------------+-----------+----+-----------------+------------------------------------------+----------------------------------------------------------------------------------------------+-----------+---------+
|marketplace|customer_id|review_id     |product_id|product_parent|product_title                                                                                                                                  |product_category|star_rating|helpful_votes|total_votes|vine|verified_purchase|review_headline                           |review_body                                                                                   |review_date|sentiment|
+-----------+-----------+--------------+----------+--------------+------------------------------------

1. Count reviews by sentiments




In [9]:
customer_sentiments.groupBy("sentiment").count().show()

+---------+-----+
|sentiment|count|
+---------+-----+
| POSITIVE|61788|
| NEGATIVE|26624|
|    MIXED| 7835|
|  NEUTRAL|  586|
+---------+-----+





2.   Count reviews by star rating and sentiments




In [11]:
customer_sentiments.groupBy(["star_rating" ,"sentiment"]).count().show()

+-----------+---------+-----+
|star_rating|sentiment|count|
+-----------+---------+-----+
|          1| NEGATIVE|18109|
|          3| POSITIVE| 1789|
|          5| POSITIVE|47246|
|          2| POSITIVE|  574|
|          2|  NEUTRAL|   27|
|          5|    MIXED| 1206|
|          4|  NEUTRAL|  105|
|          2|    MIXED|  828|
|          1|  NEUTRAL|  133|
|          5| NEGATIVE|  983|
|          3| NEGATIVE| 2521|
|          4| POSITIVE|11236|
|          4| NEGATIVE|  966|
|          3|    MIXED| 2313|
|          4|    MIXED| 2846|
|          1|    MIXED|  642|
|          5|  NEUTRAL|  228|
|          2| NEGATIVE| 4045|
|          3|  NEUTRAL|   93|
|          1| POSITIVE|  943|
+-----------+---------+-----+





3.   Top 10 highest-rated reviews with text



In [21]:
from pyspark.sql.functions import desc


In [22]:
customer_sentiments \
    .orderBy(desc("star_rating")) \
    .select("star_rating", "review_headline", "review_body") \
    .show(10)


+-----------+--------------------+--------------------+
|star_rating|     review_headline|         review_body|
+-----------+--------------------+--------------------+
|          5| Love it for camping|Wonderful! It spi...|
|          5|Save my electric ...|After watching a ...|
|          5|If you need a new...|What a great stov...|
|          5|          Five Stars|        worked great|
|          5|       Fast Shipping|Part exactly what...|
|          5|Very well satisfied.|Arrived on time a...|
|          5|          Five Stars|No more running t...|
|          5|          Five Stars|               Super|
|          5|          Five Stars|exactly what I wa...|
|          5|       Great Product|My wife is lookin...|
+-----------+--------------------+--------------------+
only showing top 10 rows


# **Using Spark SQL**

In [23]:
customer_sentiments.createOrReplaceTempView("customer_sentiments")


1. Count reviews by sentiments




In [24]:
query1= """
    select count(*) as count, sentiment
    from customer_sentiments
    group by sentiment
    order by count desc
"""

In [25]:
spark.sql(query1).show()

+-----+---------+
|count|sentiment|
+-----+---------+
|61788| POSITIVE|
|26624| NEGATIVE|
| 7835|    MIXED|
|  586|  NEUTRAL|
+-----+---------+





2.   Count reviews by star rating and sentiments




In [26]:
query2= """
    select count(*) as count, star_rating, sentiment
    from customer_sentiments
    group by star_rating, sentiment
"""

In [27]:
spark.sql(query2).show()

+-----+-----------+---------+
|count|star_rating|sentiment|
+-----+-----------+---------+
|18109|          1| NEGATIVE|
| 1789|          3| POSITIVE|
|47246|          5| POSITIVE|
|  574|          2| POSITIVE|
|   27|          2|  NEUTRAL|
| 1206|          5|    MIXED|
|  105|          4|  NEUTRAL|
|  828|          2|    MIXED|
|  133|          1|  NEUTRAL|
|  983|          5| NEGATIVE|
| 2521|          3| NEGATIVE|
|11236|          4| POSITIVE|
|  966|          4| NEGATIVE|
| 2313|          3|    MIXED|
| 2846|          4|    MIXED|
|  642|          1|    MIXED|
|  228|          5|  NEUTRAL|
| 4045|          2| NEGATIVE|
|   93|          3|  NEUTRAL|
|  943|          1| POSITIVE|
+-----+-----------+---------+





3.   Top 10 highest-rated reviews with text



In [28]:
query3= """
    select star_rating, review_headline, review_body
    from customer_sentiments
    order by star_rating desc
"""

In [29]:
spark.sql(query3).show()

+-----------+--------------------+--------------------+
|star_rating|     review_headline|         review_body|
+-----------+--------------------+--------------------+
|          5|perfect fit - wat...|Arrived when prom...|
|          5|If you need a new...|What a great stov...|
|          5|Save my electric ...|After watching a ...|
|          5|          Five Stars|This is so very A...|
|          5|       Fast Shipping|Part exactly what...|
|          5|          Five Stars|It was everything...|
|          5|          Five Stars|No more running t...|
|          5|EXCELLENT lightin...|Bought this for a...|
|          5|          Five Stars|exactly what I wa...|
|          5|          Five Stars|very happy with p...|
|          5|Very well satisfied.|Arrived on time a...|
|          5|          Five Stars|         perfect fit|
|          5|       Great Product|My wife is lookin...|
|          5|          Five Stars|Arrived quickly a...|
|          5|          Five Stars|        worked