<div style="font-size:18pt; padding-top:20px; text-align:center"><b>Introduction to <span style="font-weight:bold; color:green">Spark</span> with Jupyter</b></div><hr>
<div style="text-align:right;">Sergei Yu. Papulin <span style="font-style: italic;font-weight: bold;">(papulin_bmstu@mail.ru)</span></div>

<a name="0"></a>
<div><span style="font-size:14pt; font-weight:bold">Content</span>
    <ol>
        <li><a href="#1">Word Count</a>
        </li>
        <li><a href="#2">Average Rating Calculation</a>
            <ol style = "list-style-type:lower-alpha">
                <li><a href="#2a">Average ratings for each product</a></li>
                <li><a href="#2b">Average rating of all products</a></li>
                <li><a href="#2c">Filter items by their ratings</a></li>
                <li><a href="#2d">Average rating of product</a></li>
            </ol>
        </li>
    </ol>
</div>

<p>[OPTIONAL] <b>Environment Setup</b></p>

In [None]:
import glob
import os
import sys

# Cluster-specific locations of the Spark 2 distribution and of the Python
# interpreter used by both the driver and the executors.
os.environ["SPARK_HOME"] = "/opt/cloudera/parcels/SPARK2/lib/spark2"
os.environ["PYSPARK_PYTHON"] = "/opt/rh/rh-python36/root/usr/bin/python"
os.environ["PYSPARK_DRIVER_PYTHON"] = "/opt/rh/rh-python36/root/usr/bin/python"

spark_home = os.environ.get("SPARK_HOME")
spark_python_dir = os.path.join(spark_home, "python")

# The py4j archive name changes with the Spark release, so look it up instead
# of pinning a single version. If nothing is found, fall back to the name this
# notebook was originally written against.
py4j_archives = sorted(glob.glob(os.path.join(spark_python_dir, "lib", "py4j-*-src.zip")))
if py4j_archives:
    py4j_archive = py4j_archives[-1]
else:
    py4j_archive = os.path.join(spark_home, "python/lib/py4j-0.10.7-src.zip")

# Make `import pyspark` (and its py4j dependency) resolvable from this kernel.
sys.path.insert(0, spark_python_dir)
sys.path.insert(0, py4j_archive)

<p>Create the Spark context</p>

In [None]:
import pyspark

In [None]:
# Application settings: submit to YARN with the "client" deploy mode
conf = (
    pyspark.SparkConf()
    .setMaster("yarn")
    .setAppName("jupyterRDDApp")
    .set("spark.submit.deployMode", "client")
)

In [None]:
# getOrCreate() returns the already running context when this cell is executed
# again, instead of failing because a second SparkContext cannot be started in
# the same kernel. Note that `conf` only takes effect when a new context is
# actually created.
sc = pyspark.SparkContext.getOrCreate(conf=conf)

<a name="1"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:14pt; font-weight:bold">1. Word Count</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">To Content</a></div>
    </div>
</div>

In [None]:
import json

In [None]:
# Input file with the reviews; each line is parsed as one JSON document by the cells below
file_path = "data/spark_rdd/samples_100.json"
# Directory the word-count result is written to (note the trailing slash when concatenating)
output_path = "data/spark_rdd/word_count/"

In [None]:
# Load data from HDFS
textFile = sc.textFile(file_path)
textFile.take(2)

In [None]:
# Split a text of reviews into words
wordCount_words = textFile.flatMap(lambda row: json.loads(row)["reviewText"].split(" "))
wordCount_words.take(5)

In [None]:
# Create pairs (word, 1)
wordCount_pair = wordCount_words.map(lambda word: (word, 1))
wordCount_pair.take(5)

In [None]:
# Count words
wordCount_count = wordCount_pair.reduceByKey(lambda v1, v2: v1 + v2)
wordCount_count.take(5)

In [None]:
# Sort the RDD by values
wordCount_sorted = wordCount_count.sortBy(lambda x: -x[1])
wordCount_sorted.take(5)

In [None]:
# Save the result to HDFS
wordCount_sorted.saveAsTextFile(output_path)

In [None]:
# Check
wordCount_file = sc.textFile(output_path + "/*")
wordCount_file.take(5)

<a name="2"></a>
<div style="display:table; width:100%; padding-top:10px; padding-bottom:10px; border-bottom:1px solid lightgrey">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-size:14pt; font-weight:bold">2. Average Rating Calculation</div>
    	<div style="display:table-cell; width:20%; text-align:center; background-color:whitesmoke; border:1px solid lightgrey"><a href="#0">To Content</a></div>
    </div>
</div>

<p>RDD</p>

In [None]:
# Raw review lines; marked for persistence because the RDD examples below all
# start from this RDD
rdd_review_100 = sc.textFile(file_path)
rdd_review_100.persist()
rdd_review_100.take(1)

<p>DataFrame</p>

In [None]:
from pyspark.sql import SQLContext
# SQL entry point built on the running SparkContext; used below to read the
# JSON file into a DataFrame
sqlContext = SQLContext(sc)

In [None]:
# Read the same review file as a DataFrame and mark it for persistence
df_review_100 = sqlContext.read.json(file_path)
df_review_100.persist()
# Print the first two rows
df_review_100.show(2)

<a name="2a"></a>
<div style="display:table; width:100%">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-style:italic; font-weight:bold; font-size:12pt">
            a. Average ratings for each product
        </div>
        <div style="display:table-cell; border:1px solid lightgrey; width:20%">
            <div style="display:table-cell; width:10%; text-align:center; background-color:whitesmoke;">
                <a href="#2">Back</a>
            </div>
            <div style="display:table-cell; width:10%; text-align:center;">
                <a href="#2b">Next</a>
            </div>
        </div>
    </div>
</div>

<p>RDD</p>

In [None]:
def get_prod_rating(review_json_item):
    """Turn one JSON-encoded review into an (asin, overall) pair.

    Args:
        review_json_item: JSON text of a single review containing the keys
            "asin" and "overall".

    Returns:
        A tuple ``(asin, overall)`` with the values taken as-is from the JSON.

    Raises:
        KeyError: if either key is absent from the review.

    NOTE: a later cell (section b) defines another function with this same
    name that returns only the rating; re-run this cell before re-running the
    section-a pipeline.
    """
    review = json.loads(review_json_item)
    asin, overall = review["asin"], review["overall"]
    return asin, overall

In [None]:
# (asin, overall) pair for every review.
# The function object itself is passed instead of `lambda row: get_prod_rating(row)`:
# a lambda looks the global name up only when the job is serialized, so once
# the later cell that redefines get_prod_rating has run, actions on RDDs
# derived from this one would silently use the wrong function.
rdd_prod_rating = rdd_review_100.map(get_prod_rating)
rdd_prod_rating.take(5)

In [None]:
def _add_rating(sum_count, rating):
    """Fold one rating into a running (sum, count) pair."""
    return (sum_count[0] + rating, sum_count[1] + 1)


def _merge_sum_count(left, right):
    """Combine two partial (sum, count) pairs."""
    return (left[0] + right[0], left[1] + right[1])


# Per product key: accumulate (sum of ratings, number of ratings), starting from (0, 0) ...
rdd_sum_count = rdd_prod_rating.aggregateByKey((0, 0), _add_rating, _merge_sum_count)
# ... then divide the sum by the count to get the average rating
rdd_avg_prod_rating = rdd_sum_count.mapValues(lambda sum_count: sum_count[0] / sum_count[1])

rdd_avg_prod_rating.collect()

<a name="2b"></a>
<div style="display:table; width:100%">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-style:italic; font-weight:bold; font-size:12pt">
            b. Average rating of all products
        </div>
        <div style="display:table-cell; border:1px solid lightgrey; width:20%">
            <div style="display:table-cell; width:10%; text-align:center; background-color:whitesmoke;">
                <a href="#2a">Back</a>
            </div>
            <div style="display:table-cell; width:10%; text-align:center;">
                <a href="#2c">Next</a>
            </div>
        </div>
    </div>
</div>

<p>RDD</p>

In [None]:
def get_prod_rating(review_json_item):
    """Extract the numeric "overall" rating from one JSON-encoded review.

    Args:
        review_json_item: JSON text of a single review.

    Returns:
        The rating as a float, or None when "overall" is missing or is not a
        number. Ratings stored as JSON integers (e.g. ``5``) are accepted and
        converted to float rather than being silently dropped.

    NOTE: this shadows the function of the same name defined in section a,
    which returns an (asin, overall) pair.
    """
    rating = json.loads(review_json_item).get("overall")
    # bool is a subclass of int in Python, so exclude it explicitly
    if isinstance(rating, (int, float)) and not isinstance(rating, bool):
        return float(rating)
    return None

In [None]:
# Ratings of all reviews, with the unusable ones (None) dropped.
# get_prod_rating is passed directly: wrapping it in a lambda adds nothing and
# would defer the lookup of the (twice-defined) name until the job is serialized.
rdd_prod_rating = (
    rdd_review_100
    .map(get_prod_rating)
    .filter(lambda rating: rating is not None)
)
rdd_prod_rating.take(5)

In [None]:
# Reduce all ratings to a single (sum of ratings, number of ratings) pair:
# the first function folds one rating into a partial pair, the second merges
# two partial pairs.
rating_count = rdd_prod_rating.aggregate(
    (0, 0),
    lambda acc, rating: (acc[0] + rating, acc[1] + 1),
    lambda left, right: (left[0] + right[0], left[1] + right[1]),
)

# With no valid ratings the count is 0; report NaN instead of raising
# ZeroDivisionError.
avg_rating = rating_count[0] / rating_count[1] if rating_count[1] else float("nan")
avg_rating

<a name="2c"></a>
<div style="display:table; width:100%">
    <div style="display:table-row">
        <div style="display:table-cell; width:80%; font-style:italic; font-weight:bold; font-size:12pt">
            c. Filter items by their ratings
        </div>
        <div style="display:table-cell; border:1px solid lightgrey; width:20%">
            <div style="display:table-cell; width:10%; text-align:center; background-color:whitesmoke;">
                <a href="#2b">Back</a>
            </div>
            <div style="display:table-cell; width:10%; text-align:center;">
                <a href="#2d">Next</a>
            </div>
        </div>
    </div>
</div>

<p>RDD</p>

In [None]:
# Minimum "overall" rating a review must have to be kept by filter_by_rating.
# It is read as a plain global variable inside that function; a broadcast
# variable could be used as well.
rating_threshold = 4

In [None]:
def filter_by_rating(review_json_item, threshold=None):
    """Tell whether a JSON-encoded review has a rating at or above a threshold.

    Args:
        review_json_item: JSON text of a single review.
        threshold: minimum accepted rating. When omitted, the global
            ``rating_threshold`` is used, so existing one-argument calls
            behave as before.

    Returns:
        True if "overall" is a number (int or float, not bool) that is
        greater than or equal to the threshold; False otherwise, including
        when "overall" is missing.
    """
    if threshold is None:
        threshold = rating_threshold
    rating = json.loads(review_json_item).get("overall")
    # bool is a subclass of int in Python, so exclude it explicitly
    is_number = isinstance(rating, (int, float)) and not isinstance(rating, bool)
    return is_number and rating >= threshold

In [None]:
# Keep only the raw review lines accepted by filter_by_rating; the predicate
# is passed directly, a wrapping lambda is not needed
rdd_items = rdd_review_100.filter(filter_by_rating)
# Number of reviews that passed the filter
rdd_items.count()

In [None]:
# Peek at two of the raw review lines that passed the rating filter
rdd_items.take(2)