In [74]:
import numpy as np
import nltk
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
import pyspark.sql.functions as f
from pyspark.sql.types import BooleanType,IntegerType

VBox()

In [26]:
# Create (or reuse) the notebook's SparkSession, running in local mode on all
# available cores.
# Options that were tried earlier and are left disabled:
#   .config("spark.num.executors","4")
#   .config("spark.executor.cores","4")
# NOTE(review): memory settings passed through the builder may not apply if
# the driver JVM is already running when this cell executes — confirm.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Spark Text Encoder example")
    .config("spark.default.parallelism", "20")
    .config("spark.executor.memory", "16g")
    .config("spark.driver.memory", "16g")
    .getOrCreate()
)

VBox()

In [27]:
# Location of the review data. An alternative input, currently disabled:
#   "s3://amazon-reviews-pds/tsv/amazon_reviews_us_Music_v1_00.tsv.gz"
rev_data = "s3://amazon-reviews-pds/tsv/sample_fr.tsv"

# The file is tab-separated and its first row carries the column names.
revs = spark.read.csv(
    path=rev_data,
    sep='\t',
    header=True,
)

VBox()

In [4]:
# Print the column names and types, then redistribute the rows over 40 partitions.
revs.printSchema()
revs = revs.repartition(numPartitions=40)

VBox()

root
 |-- marketplace: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- product_parent: string (nullable = true)
 |-- product_title: string (nullable = true)
 |-- product_category: string (nullable = true)
 |-- star_rating: string (nullable = true)
 |-- helpful_votes: string (nullable = true)
 |-- total_votes: string (nullable = true)
 |-- vine: string (nullable = true)
 |-- verified_purchase: string (nullable = true)
 |-- review_headline: string (nullable = true)
 |-- review_body: string (nullable = true)
 |-- review_date: string (nullable = true)

In [12]:
# Collect the product ids and customer ids whose reviews should be removed.
# The thresholds are named so they can be tuned in one place.
MIN_REVIEWS_PER_PRODUCT = 2
MIN_REVIEWS_PER_USER = 1


def _ids_with_fewer_reviews(df, id_col, min_reviews):
    """Return the set of `id_col` values that appear in fewer than `min_reviews` rows.

    Parameters:
        df: DataFrame holding one row per review.
        id_col: name of the column to group by (e.g. 'product_id').
        min_reviews: ids whose row count is strictly below this are returned.

    Returns:
        A set of the matching ids, exactly as Spark returns them (no
        re-encoding), collected onto the driver.
    """
    rare_rows = (
        df.groupBy(id_col)
          .count()
          .filter(f.col('count') < min_reviews)
          .select(id_col)
          .collect()
    )
    # The ids are deliberately NOT passed through .encode(): on Python 3 that
    # yields bytes, which never compare equal to the str values seen by the
    # filter UDFs, so nothing would ever be removed. It would also raise on a
    # null id.
    return {row[id_col] for row in rare_rows}


# The "_list" names are kept because other cells reference them; the values are
# sets so that the per-row `in` checks are constant-time.
rm_products_list = _ids_with_fewer_reviews(revs, 'product_id', MIN_REVIEWS_PER_PRODUCT)

# NOTE(review): a groupBy count is never below 1, so with
# MIN_REVIEWS_PER_USER = 1 this set is always empty — confirm the intended
# threshold.
rm_users_list = _ids_with_fewer_reviews(revs, 'customer_id', MIN_REVIEWS_PER_USER)

VBox()

In [24]:
# UDF that tells whether a review body contains at least two sentences.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')


def valid_sentences(paragraph):
    """Return True when `paragraph` is not None and splits into two or more sentences."""
    if paragraph is None:
        return False
    sentence_count = len(tokenizer.tokenize(paragraph))
    return sentence_count >= 2


udf_sentences = udf(valid_sentences, BooleanType())

VBox()

In [19]:
# UDF that keeps a review only when its product is not marked for removal.
def valid_products(product_id):
    """Return True when `product_id` is absent from `rm_products_list`.

    `rm_products_list` is a notebook-level collection of product ids that must
    already be defined when the UDF runs.
    """
    marked_for_removal = product_id in rm_products_list
    return not marked_for_removal


udf_products = udf(valid_products, BooleanType())

VBox()

In [20]:
# UDF that keeps a review only when its author is not marked for removal.
def valid_users(customer_id):
    """Return True when `customer_id` is absent from `rm_users_list`.

    `rm_users_list` is a notebook-level collection of customer ids that must
    already be defined when the UDF runs.
    """
    marked_for_removal = customer_id in rm_users_list
    return not marked_for_removal


udf_users = udf(valid_users, BooleanType())

VBox()

In [28]:
# Narrow the dataset with the three row-level filters, applied in order:
# sentence count of the body, then product id, then customer id.
revs = (
    revs
    .filter(udf_sentences(revs['review_body']))
    .filter(udf_products(revs['product_id']))
    .filter(udf_users(revs['customer_id']))
)

VBox()

In [61]:
# UDF that counts the sentences in a review body.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')


def num_sentences(paragraph):
    """Return the number of sentences in `paragraph`, or 0 when it is None."""
    if paragraph is None:
        return 0
    return len(tokenizer.tokenize(paragraph))


udf_sentences_num = udf(num_sentences, IntegerType())

VBox()

In [64]:
# UDF that reduces a list of numbers to its median.
def median_sentences(val_list):
    """Return the median of `val_list` converted with int().

    int() drops any fractional part, so a median of 2.5 is returned as 2.
    """
    middle = np.median(val_list)
    return int(middle)


udf_median_sentences = udf(median_sentences, IntegerType())

VBox()

In [72]:
# Top 10 users ranked by the median number of sentences in the reviews they
# have published.
sentences_per_user = revs.groupBy('customer_id').agg(
    f.collect_list(udf_sentences_num(revs.review_body)).alias('val_list')
)
top_users_by_median = (
    sentences_per_user
    .withColumn('median', udf_median_sentences('val_list'))
    .sort('median', ascending=False)
    .limit(10)
)
top_users_by_median.show()

VBox()

+-----------+---------+------+
|customer_id| val_list|median|
+-----------+---------+------+
|      29019|      [6]|     6|
|      26552|   [3, 9]|     6|
|      28700|      [4]|     4|
|      31333|[5, 3, 2]|     3|
|      28143|      [3]|     3|
|      17564|      [3]|     3|
|      32494|      [3]|     3|
|      31588|      [2]|     2|
|      28692|      [2]|     2|
|      24844|      [2]|     2|
+-----------+---------+------+

In [73]:
# Top 10 products ranked by the median number of sentences in the reviews they
# have received.
sentences_per_product = revs.groupBy('product_id').agg(
    f.collect_list(udf_sentences_num(revs.review_body)).alias('val_list')
)
top_products_by_median = (
    sentences_per_product
    .withColumn('median', udf_median_sentences('val_list'))
    .sort('median', ascending=False)
    .limit(10)
)
top_products_by_median.show()

VBox()

+----------+--------+------+
|product_id|val_list|median|
+----------+--------+------+
|B00D1VKZ3M|     [9]|     9|
|B000FUM0TE|     [6]|     6|
|0316225932|     [6]|     6|
|B00FK4Y0Z2|     [5]|     5|
|B008U6R9B4|     [5]|     5|
|B00I7VMIC2|     [4]|     4|
|B00GIGGS6A|     [3]|     3|
|B004I85CY4|     [3]|     3|
|B00005LANG|     [3]|     3|
|B00005UMYC|     [3]|     3|
+----------+--------+------+