In [1]:
import pyspark
from pyspark.sql import SparkSession
# Reuse the currently active SparkSession if there is one; otherwise build a
# new session with the default configuration.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)
# Leaving the session as the last expression makes the notebook display it.
spark

In [2]:
# The SparkContext attached to the session is the entry point to the RDD API;
# it is shown here only for inspection.
spark.sparkContext

In [3]:
# Explicit list of the three review files to load.
reviews_paths = [
    'data/reviews_fashion.json',
    'data/reviews_electronics.json',
    'data/reviews_sports.json',
]

# No schema is passed, so the column types are inferred from the JSON data.
inferred_reviews = spark.read.json(path=reviews_paths)
inferred_reviews

DataFrame[asin: string, helpful: array<bigint>, overall: double, reviewText: string, reviewTime: string, reviewerID: string, reviewerName: string, summary: string, unixReviewTime: bigint]

In [4]:
# Initialize a variable called "POSIX", then assign it a string holding the
# file-path pattern that captures all three .json files. The "*" wildcard
# stands in for the part of the file name that differs between them.

POSIX = 'data/reviews_*.json'

In [5]:
# Load the review files again, this time through the wildcard pattern
# instead of the explicit list of paths.
inferred_reviews = spark.read.json(path=POSIX)

In [6]:
# Display the DataFrame summary (column names and their types).
inferred_reviews

DataFrame[asin: string, helpful: array<bigint>, overall: double, reviewText: string, reviewTime: string, reviewerID: string, reviewerName: string, summary: string, unixReviewTime: bigint]

In [7]:
# Import the Spark SQL data type classes
from pyspark.sql.types import *

# Define the schema by hand, one nullable column at a time.
# NOTE: the reviewText column is not defined here (see the exercise in the
# next cell).
REVIEWS_SCHEMA_DEF = (
    StructType()
    .add('reviewerID', StringType(), True)
    .add('asin', StringType(), True)
    .add('reviewerName', StringType(), True)
    # 'helpful' is an array of nullable integers
    .add('helpful', ArrayType(IntegerType(), True), True)
    # review text column definition missing here
    .add('reviewTime', StringType(), True)
    .add('overall', DoubleType(), True)
    .add('summary', StringType(), True)
    .add('unixReviewTime', LongType(), True)
)

print(REVIEWS_SCHEMA_DEF)

StructType(List(StructField(reviewerID,StringType,true),StructField(asin,StringType,true),StructField(reviewerName,StringType,true),StructField(helpful,ArrayType(IntegerType,true),true),StructField(reviewTime,StringType,true),StructField(overall,DoubleType,true),StructField(summary,StringType,true),StructField(unixReviewTime,LongType,true)))


In [8]:
# The reviewText field was not added to REVIEWS_SCHEMA_DEF. What do you think
# will happen if we try to enforce this schema on the reviews dataset?
#   (a) Spark will enforce the manual schema on the defined columns and ignore the others
#   (b) Spark will enforce the manual schema on the defined columns and infer the schema for the others
# Try it out:
reviews = spark.read.schema(REVIEWS_SCHEMA_DEF).json(POSIX)
print("The answer is {}".format(reviews))

The answer is DataFrame[reviewerID: string, asin: string, reviewerName: string, helpful: array<int>, reviewTime: string, overall: double, summary: string, unixReviewTime: bigint]


In [9]:
# Print the schema as a tree to see which columns were actually loaded.
reviews.printSchema()

root
 |-- reviewerID: string (nullable = true)
 |-- asin: string (nullable = true)
 |-- reviewerName: string (nullable = true)
 |-- helpful: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- reviewTime: string (nullable = true)
 |-- overall: double (nullable = true)
 |-- summary: string (nullable = true)
 |-- unixReviewTime: long (nullable = true)



In [10]:
# Add the reviewText field to the data schema (5 mins).
# FULL_REVIEWS_SCHEMA_DEF lists every column of the reviews data, including
# reviewText, each declared as nullable.

FULL_REVIEWS_SCHEMA_DEF = (
    StructType()
    .add('reviewerID', StringType(), True)
    .add('asin', StringType(), True)
    .add('reviewerName', StringType(), True)
    # 'helpful' is an array of nullable integers
    .add('helpful', ArrayType(IntegerType(), True), True)
    .add('reviewText', StringType(), True)
    .add('reviewTime', StringType(), True)
    .add('overall', DoubleType(), True)
    .add('summary', StringType(), True)
    .add('unixReviewTime', LongType(), True)
)

print(FULL_REVIEWS_SCHEMA_DEF)

StructType(List(StructField(reviewerID,StringType,true),StructField(asin,StringType,true),StructField(reviewerName,StringType,true),StructField(helpful,ArrayType(IntegerType,true),true),StructField(reviewText,StringType,true),StructField(reviewTime,StringType,true),StructField(overall,DoubleType,true),StructField(summary,StringType,true),StructField(unixReviewTime,LongType,true)))


In [11]:
# Reload the reviews with the complete schema, then print the schema tree to
# confirm that reviewText is now part of the DataFrame.
reviews = spark.read.schema(FULL_REVIEWS_SCHEMA_DEF).json(POSIX)
reviews.printSchema()

root
 |-- reviewerID: string (nullable = true)
 |-- asin: string (nullable = true)
 |-- reviewerName: string (nullable = true)
 |-- helpful: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- reviewText: string (nullable = true)
 |-- reviewTime: string (nullable = true)
 |-- overall: double (nullable = true)
 |-- summary: string (nullable = true)
 |-- unixReviewTime: long (nullable = true)



In [None]:
#DataFrame operations
#The Spark DataFrame API allows you to perform multiple operations on the data. The primary advantage of using the DataFrame API
#is that you can do data transformations with the high-level API without having to use Python. Using the high-level API
#has performance advantages.
#  The DataFrame API has functionality similar to that of the Core RDD API. For example:
#      map : foreach, Select
#      filter : filter
#      groupByKey, reduceByKey : groupBy
#
##Selecting Columns
#You can use the SELECT statement to select columns from your DataFrame.