<div style="font-size:18pt; padding-top:20px; text-align:center"><div><b><span style="font-weight:bold; color:green">Spark</span> and Processing Customer Reviews</b></div>
<div style="font-size:16pt; padding-top:20px;">
    Part 1. Interactive shell with Jupyter</div>
</div><hr>
<div style="text-align:right;">Sergei Yu. Papulin <span style="font-style: italic;font-weight: bold;">(papulin_bmstu@mail.ru)</span></div>

<a name="0"></a>

### Contents

1. Word Count
    - Developing application using interactive shell
    - Running as single job
    - Importing external modules
2. Average Rating Calculation
    - Developing application using interactive shell
    - Importing external modules
3. Stopping Spark Context

<p>[OPTIONAL] <b>Environment Setup</b></p>

In [None]:
import os
import sys

# Point this kernel at the local Spark installation and at the Python
# interpreter used by both the driver and the workers.
os.environ.update({
    "SPARK_HOME": "/home/ubuntu/BigData/spark",
    "PYSPARK_PYTHON": "/home/ubuntu/ML/anaconda3/bin/python",
    "PYSPARK_DRIVER_PYTHON": "/home/ubuntu/ML/anaconda3/bin/python",
})

# Put Spark's Python sources and its bundled py4j archive at the front of
# sys.path (py4j first) so that `import pyspark` can be resolved.
spark_home = os.environ.get("SPARK_HOME")
sys.path[:0] = [
    os.path.join(spark_home, "python/lib/py4j-0.10.7-src.zip"),
    os.path.join(spark_home, "python"),
]

<p>Run Spark Context</p>

In [None]:
import pyspark

If you run on a **YARN** cluster:

In [None]:
# conf = pyspark.SparkConf() \
#         .setAppName("reviewJupyterApp") \
#         .setMaster("yarn") \
#         .set("spark.submit.deployMode", "client")

If you run **locally**:

In [None]:
# Configuration for a local run with two worker threads.
# The executor-cores property is spelled "spark.executor.cores"; the
# misspelled key "spark.executor.core" is not a Spark setting and was ignored.
conf = (
    pyspark.SparkConf()
    .setAppName("reviewJupyterApp")
    .setMaster("local[2]")
    .set("spark.executor.memory", "1g")
    .set("spark.executor.cores", "2")
    .set("spark.driver.memory", "2g")
)

In [None]:
# Reuse the running context if there is one, so re-executing this cell does
# not fail with "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate(conf=conf)
sc

In [None]:
# Smoke test: distribute a small list and add 1 to every element twice
data = [1, 2, 3, 4, 5, 6, 7, 8, 9]
rdd_data = sc.parallelize(data)
rdd_data_ = rdd_data.map(lambda x: x + 1).map(lambda x: x + 1)

# Collect the transformed RDD (not the source one) to see the effect of the maps
rdd_data_.collect()

In [None]:
# The context is deliberately left running here: every cell below uses `sc`,
# and the notebook stops it once in its final cell.

Dataset

In [None]:
# HDFS locations (placeholders: replace YOUR_HDFS_PATH with a real directory)
file_path = "/YOUR_HDFS_PATH/samples_100.json"
output_path = "/YOUR_HDFS_PATH/output"

# Local file system locations (file:// scheme)
FILE_PATH = "file:///home/ubuntu/BigData/datasets/samples_100.json"
OUTPUT_DIR_PATH = "file:///home/ubuntu/BigData/datasets/output"

<a name="1"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:16pt; font-weight:bold">1. Word Count</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">To Content</a></div>
    </div>
</div>

### Developing application using interactive shell

In [None]:
import json

In [None]:
# Load the reviews file as an RDD of text lines
# (each line is parsed with json.loads in the cells below)
json_reviews_rdd = sc.textFile(FILE_PATH)
# Preview the first two lines
json_reviews_rdd.take(2)

In [None]:
# Parse every line as JSON and break its "reviewText" field into
# space-separated tokens; flatMap merges the per-review lists into one RDD.
words_rdd = json_reviews_rdd.flatMap(
    lambda line: json.loads(line)["reviewText"].split(" ")
)
words_rdd.take(5)

In [None]:
# Pair every word with an initial count of 1
wcount_pair_rdd = words_rdd.map(lambda token: (token, 1))
wcount_pair_rdd.take(5)

In [None]:
# Add up the counts of identical words
wcount_rdd = wcount_pair_rdd.reduceByKey(lambda left, right: left + right)
wcount_rdd.take(5)

In [None]:
# Order the (word, count) pairs by count, largest first
# (the count is negated so that the ascending sort yields descending counts)
wcount_sorted_rdd = wcount_rdd.sortBy(lambda pair: -pair[1])
wcount_sorted_rdd.take(5)

In [None]:
# Render each (word, count) pair as a tab-separated text line
wcount_out_rdd = wcount_sorted_rdd.map(
    lambda pair: "{}\t{}".format(pair[0], pair[1])
)
wcount_out_rdd.take(5)

In [None]:
# Save the result under OUTPUT_DIR_PATH (a file:// location on the local FS);
# coalesce(1) merges the data into one partition before writing
wcount_out_rdd.coalesce(1).saveAsTextFile(OUTPUT_DIR_PATH)

In [None]:
# Check: read back the files that were just written.
# The result was saved to OUTPUT_DIR_PATH, so read from there rather than
# from the HDFS placeholder `output_path`.
wcount_in_rdd = sc.textFile(OUTPUT_DIR_PATH + "/*")
wcount_in_rdd.take(5)

### Running as single job

In [None]:
# Remove the output directory written earlier: saveAsTextFile refuses to
# write into a directory that already exists.
import shutil

# Strip the URI scheme to get a plain local path
OUTPUT_DIR_PATH_ = OUTPUT_DIR_PATH.replace("file://", "")
# ignore_errors=True keeps this a no-op when the directory is absent
shutil.rmtree(OUTPUT_DIR_PATH_, ignore_errors=True)

In [None]:
# import json

# # Chain of transformations
# wcount_out_rdd = sc.textFile(FILE_PATH) \
#     .flatMap(lambda row: json.loads(row)["reviewText"].split(" ")) \
#     .map(lambda word: (word, 1)) \
#     .reduceByKey(lambda v1, v2: v1 + v2) \
#     .sortBy(lambda x: -x[1]) \
#     .map(lambda x: "{}\t{}".format(x[0], x[1])) \
#     .coalesce(1)

# # Start job
# wcount_out_rdd.saveAsTextFile(OUTPUT_DIR_PATH)

In [None]:
# The whole word count as one chain of lazy transformations:
# read lines -> split review text -> (word, 1) -> sum per word
# -> sort by count descending -> format -> single partition
wcount_out_rdd = (
    sc.textFile(FILE_PATH)
    .flatMap(lambda line: json.loads(line)["reviewText"].split(" "))
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
    .sortBy(lambda pair: -pair[1])
    .map(lambda pair: "{}\t{}".format(pair[0], pair[1]))
    .coalesce(1)
)

# Writing the output is the action that actually starts the job
wcount_out_rdd.saveAsTextFile(OUTPUT_DIR_PATH)

In [None]:
# Local
!ls $OUTPUT_DIR_PATH_

In [None]:
# HDFS
# !hdfs dfs -ls /YOUR_PATH/data/spark-rdd-intro/output

Using **combineByKey**

In [None]:
def init_value(val):
    """Create the initial accumulator for a key: the first value, unchanged."""
    return val

def reduce_inside_partition(acc, val):
    """Fold one more value into the accumulator (passed to combineByKey as the merge-value step)."""
    return acc + val
    
def reduce_partitions(acc, val):
    """Merge two accumulators (passed to combineByKey as the merge-combiners step)."""
    return acc + val

# Same word count, with combineByKey and the three explicit combiner
# functions in place of reduceByKey
wcount_out_rdd = (
    sc.textFile(FILE_PATH)
    .flatMap(lambda line: json.loads(line)["reviewText"].split(" "))
    .map(lambda token: (token, 1))
    .combineByKey(init_value, reduce_inside_partition, reduce_partitions)
    .sortBy(lambda pair: -pair[1])
    .map(lambda pair: "{}\t{}".format(pair[0], pair[1]))
)

wcount_out_rdd.take(5)

### Importing external modules

In [None]:
def extract_words(items):
    """Yield a (word, 1) pair for every word of every review in a partition.

    Args:
        items: iterable of JSON-encoded review rows.

    Yields:
        (word, 1) tuples, where words come from splitting the "reviewText"
        field on single spaces. Rows that cannot be parsed, lack
        "reviewText", or hold a non-string text are skipped.
    """
    import json
    for item in items:
        try:
            words = json.loads(str(item))["reviewText"].split(" ")
        except (ValueError, KeyError, TypeError, AttributeError):
            # Malformed row: skip it. Only parse/shape errors are caught, and
            # the yield stays outside the try block, so closing the generator
            # early (GeneratorExit) or interrupting the kernel is not swallowed.
            continue
        for word in words:
            yield (word, 1)


# Try the function on the driver with the first two rows;
# a plain list stands in for the partition iterator
json_reviews = json_reviews_rdd.take(2)
for item in extract_words(json_reviews):
    print(item)

In [None]:
# Word count where parsing happens once per partition via mapPartitions
wcount_out_rdd = (
    sc.textFile(FILE_PATH)
    .mapPartitions(extract_words)
    .reduceByKey(lambda left, right: left + right)
    .sortBy(lambda pair: -pair[1])
    .map(lambda pair: "{}\t{}".format(pair[0], pair[1]))
)


# take() is the action that starts the job
wcount_out_rdd.take(5)

<a name="2"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:16pt; font-weight:bold">2. Average Rating Calculation</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">To Content</a></div>
    </div>
</div>

### Developing application using interactive shell

#### Loading initial data

In [None]:
# Reload the reviews and mark the RDD for persistence, since the cells
# below reuse it several times
json_reviews_rdd = sc.textFile(FILE_PATH)
json_reviews_rdd.persist()
# Preview the first line
json_reviews_rdd.take(1)

#### Average ratings for each product

In [None]:
def extract_prod_rating(item):
    """Extract the product id and rating from one JSON-encoded review.

    Args:
        item: a JSON string with "asin" and "overall" fields.

    Returns:
        An (asin, rating) tuple with the rating as a float, or None when the
        row cannot be parsed, lacks a field, or holds a non-numeric rating.
    """
    try:
        review = json.loads(item)
        return review["asin"], float(review["overall"])
    except (ValueError, KeyError, TypeError, OverflowError):
        # Only parse/shape errors mark a bad row; anything else
        # (e.g. KeyboardInterrupt) must propagate.
        return None

# Check the function on the first review
single_review = json_reviews_rdd.take(1)[0]
extract_prod_rating(single_review)

In [None]:
# Build (asin, rating) pairs. extract_prod_rating returns None for rows it
# cannot parse; drop those here, otherwise the key-based aggregation in the
# next step fails on a non-pair record.
prod_rating_rdd = (
    json_reviews_rdd
    .map(extract_prod_rating)
    .filter(lambda pair: pair is not None)
)
prod_rating_rdd.take(5)

In [None]:
# Per product, accumulate (sum of ratings, number of ratings) starting from
# (0, 0), then divide to get the mean rating
avg_prod_rating_rdd = (
    prod_rating_rdd
    .aggregateByKey(
        (0, 0),
        lambda acc, rating: (acc[0] + rating, acc[1] + 1),
        lambda left, right: (left[0] + right[0], left[1] + right[1]),
    )
    .mapValues(lambda total: total[0] / total[1])
)

avg_prod_rating_rdd.collect()

#### Average rating of all products

In [None]:
def extract_rating(item):
    """Extract the rating from one JSON-encoded review.

    Args:
        item: a JSON string with an "overall" field.

    Returns:
        The rating as a float, or None when the row cannot be parsed, lacks
        the field, or holds a non-numeric rating.
    """
    try:
        return float(json.loads(item)["overall"])
    except (ValueError, KeyError, TypeError, OverflowError):
        # Only parse/shape errors mark a bad row; anything else
        # (e.g. KeyboardInterrupt) must propagate.
        return None

In [None]:
# Ratings of all reviews; rows that extract_rating could not parse (None)
# are dropped
prod_rating_rdd = (
    json_reviews_rdd
    .map(extract_rating)
    .filter(lambda value: value is not None)
)

prod_rating_rdd.take(5)

In [None]:
# One pass over the ratings producing (sum of ratings, number of ratings)
rating_count = prod_rating_rdd.aggregate(
    (0, 0),
    lambda acc, rating: (acc[0] + rating, acc[1] + 1),
    lambda left, right: (left[0] + right[0], left[1] + right[1]),
)

# Mean rating = sum / count
avg_rating = rating_count[0] / rating_count[1]
avg_rating

In [None]:
# Alternative: two separate actions, a sum via reduce and a count
rating_sum = prod_rating_rdd.reduce(lambda left, right: left + right)
n = prod_rating_rdd.count()
avg_rating = rating_sum / n
avg_rating

#### Filter items by their ratings

In [None]:
# Plain driver-side variable.
# Note: without a broadcast variable this value is copied to each task,
#  which is acceptable for small data.
rating_threshold = 5

# Broadcast variable wrapping the same threshold
rating_threshold_br = sc.broadcast(rating_threshold)

In [None]:
def filter_by_rating(item):
    """Tell whether a JSON-encoded review meets the broadcast rating threshold.

    Args:
        item: a JSON string with an "overall" field.

    Returns:
        True when the rating is greater than or equal to
        rating_threshold_br.value; False when it is lower or when the row
        cannot be parsed, lacks the field, or holds a non-numeric rating.
    """
    try:
        rating = float(json.loads(item)["overall"])
    except (ValueError, KeyError, TypeError, OverflowError):
        # Bad row: treat it as not passing the filter
        return False
    # Read the threshold outside the try block so that a problem with the
    # broadcast variable is reported instead of silently filtering out rows
    return rating >= rating_threshold_br.value

In [None]:
# Keep only the raw JSON lines accepted by filter_by_rating, then count them
items_rdd = json_reviews_rdd.filter(filter_by_rating)
items_rdd.count()

In [None]:
# Preview two of the filtered lines
items_rdd.take(2)

### Importing external modules

#### Average ratings for each product

In [None]:
def extract_prod_rating_per_partition(items):
    """Yield an (asin, rating) pair for every valid review in a partition.

    Args:
        items: iterable of JSON-encoded review rows.

    Yields:
        (asin, rating) tuples with the rating as a float. Rows that cannot
        be parsed, lack a field, or hold a non-numeric rating are skipped.
    """
    import json
    for item in items:
        try:
            review = json.loads(item)
            pair = review["asin"], float(review["overall"])
        except (ValueError, KeyError, TypeError, OverflowError):
            # Malformed row: skip it. The yield stays outside the try block so
            # closing the generator early (GeneratorExit) is not swallowed.
            continue
        yield pair


# Check the function on the first two reviews
# (a plain list stands in for the partition iterator)
json_reviews = json_reviews_rdd.take(2)
for item in extract_prod_rating_per_partition(json_reviews):
    print(item)

In [None]:
# Per-product mean rating, parsing rows once per partition:
# accumulate (sum of ratings, number of ratings) per product, then divide
avg_prod_rating_rdd = (
    sc.textFile(FILE_PATH)
    .mapPartitions(extract_prod_rating_per_partition)
    .aggregateByKey(
        (0, 0),
        lambda acc, rating: (acc[0] + rating, acc[1] + 1),
        lambda left, right: (left[0] + right[0], left[1] + right[1]),
    )
    .mapValues(lambda total: total[0] / total[1])
)

avg_prod_rating_rdd.take(5)

<a name="3"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:16pt; font-weight:bold">3. Stopping Spark Context</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">To Content</a></div>
    </div>
</div>

In [None]:
# Shut down the Spark context now that all jobs are done
sc.stop()