# Dataset extractor
Disclaimer: To run this notebook, launch pyspark (command "pyspark --master local[*number of cores*]") from the folder containing the notebook.

In [None]:
from pyspark.sql import SparkSession
from utility_functions import save_rdd_to_json_file, merge_files
import os

In [None]:
# Parent of the current working directory, normalised to "/" separators
directory = os.path.dirname(os.getcwd()).replace("\\", "/")

# Full review dump (input), merged output file, and the folder the
# per-rating samples are written to
path_complete_dataset = f"{directory}/data/datasets/All_Amazon_Review.json"
path_merged_dataset = f"{directory}/data/datasets/dataset.json"
path_dataset_directory = f"{directory}/data/datasets/dataset"

# Number of examples to take per rating, and the accepted range for the
# number of characters in a review
limit = 1_000_000
lower_limit_characters, upper_limit_characters = 650, 2300

In [None]:
# Build (or reuse) the Spark session used by the following cells.
spark = (
    SparkSession.builder
    .config("spark.executor.memory", "32g")
    .config("spark.driver.memory", "32g")
    .config("spark.network.timeout", "1200s")
    .config("spark.executor.memoryOverhead", "12g")
    # The heartbeat interval has to stay well below spark.network.timeout;
    # setting both to the same value makes executors look lost.
    .config("spark.executor.heartbeatInterval", "60s")
    # The maximum heap is controlled by spark.executor.memory; Spark rejects
    # -Xmx in extraJavaOptions, so only the initial heap size is set here.
    .config("spark.executor.extraJavaOptions", "-Xms12g")
    .getOrCreate()
)

In [None]:
# Load the complete review dump, keeping only the rating and the review text
dataset = spark.read.schema("overall float, reviewText string").json(path_complete_dataset)

In [None]:
# Create rdd of items with overall 1.
# The review length is measured with spaces removed, consistent with the
# filters applied to the other ratings (2-5).
dataset1 = dataset.rdd.filter(lambda obj: obj["overall"] == 1.0 and obj["reviewText"] is not None)\
    .filter(lambda obj: lower_limit_characters <= len(obj["reviewText"].replace(" ", "")) <= upper_limit_characters)

# Estimate the total count of elements in the RDD. The timeout is expressed in
# milliseconds and the result may be incomplete (possibly 0) when it expires,
# so fall back to an exact count in that case.
estimated_count = dataset1.countApprox(timeout=100, confidence=0.95)
if estimated_count <= 0:
    estimated_count = dataset1.count()

# Calculate the fraction based on the desired sample size and estimated count.
# An empty RDD keeps the fraction at 1.0 instead of dividing by zero.
fraction = min(limit / estimated_count, 1.0) if estimated_count > 0 else 1.0

# Sample items (without replacement) and save file
save_rdd_to_json_file(path_dataset_directory + "/dataset1", dataset1.sample(False, fraction))

In [None]:
# Create rdd of items with overall 2 whose review text, with spaces removed,
# has a length inside the configured character limits
dataset2 = dataset.rdd.filter(lambda obj: obj["overall"] == 2.0 and obj["reviewText"] is not None)\
    .filter(lambda obj: lower_limit_characters <= len(obj["reviewText"].replace(" ", "")) <= upper_limit_characters)

# Estimate the total count of elements in the RDD. The timeout is expressed in
# milliseconds and the result may be incomplete (possibly 0) when it expires,
# so fall back to an exact count in that case.
estimated_count = dataset2.countApprox(timeout=100, confidence=0.95)
if estimated_count <= 0:
    estimated_count = dataset2.count()

# Calculate the fraction based on the desired sample size and estimated count.
# An empty RDD keeps the fraction at 1.0 instead of dividing by zero.
fraction = min(limit / estimated_count, 1.0) if estimated_count > 0 else 1.0

# Sample items (without replacement) and save file
save_rdd_to_json_file(path_dataset_directory + "/dataset2", dataset2.sample(False, fraction))

In [None]:
# Create rdd of items with overall 3 whose review text, with spaces removed,
# has a length inside the configured character limits
dataset3 = dataset.rdd.filter(lambda obj: obj["overall"] == 3.0 and obj["reviewText"] is not None)\
    .filter(lambda obj: lower_limit_characters <= len(obj["reviewText"].replace(" ", "")) <= upper_limit_characters)

# Estimate the total count of elements in the RDD. The timeout is expressed in
# milliseconds and the result may be incomplete (possibly 0) when it expires,
# so fall back to an exact count in that case.
estimated_count = dataset3.countApprox(timeout=100, confidence=0.95)
if estimated_count <= 0:
    estimated_count = dataset3.count()

# Calculate the fraction based on the desired sample size and estimated count.
# An empty RDD keeps the fraction at 1.0 instead of dividing by zero.
fraction = min(limit / estimated_count, 1.0) if estimated_count > 0 else 1.0

# Sample items (without replacement) and save file
save_rdd_to_json_file(path_dataset_directory + "/dataset3", dataset3.sample(False, fraction))

In [None]:
# Create rdd of items with overall 4 whose review text, with spaces removed,
# has a length inside the configured character limits
dataset4 = dataset.rdd.filter(lambda obj: obj["overall"] == 4.0 and obj["reviewText"] is not None)\
    .filter(lambda obj: lower_limit_characters <= len(obj["reviewText"].replace(" ", "")) <= upper_limit_characters)

# Estimate the total count of elements in the RDD. The timeout is expressed in
# milliseconds and the result may be incomplete (possibly 0) when it expires,
# so fall back to an exact count in that case.
estimated_count = dataset4.countApprox(timeout=100, confidence=0.95)
if estimated_count <= 0:
    estimated_count = dataset4.count()

# Calculate the fraction based on the desired sample size and estimated count.
# An empty RDD keeps the fraction at 1.0 instead of dividing by zero.
fraction = min(limit / estimated_count, 1.0) if estimated_count > 0 else 1.0

# Sample items (without replacement) and save file
save_rdd_to_json_file(path_dataset_directory + "/dataset4", dataset4.sample(False, fraction))

In [None]:
# Create rdd of items with overall 5 whose review text, with spaces removed,
# has a length inside the configured character limits
dataset5 = dataset.rdd.filter(lambda obj: obj["overall"] == 5.0 and obj["reviewText"] is not None)\
    .filter(lambda obj: lower_limit_characters <= len(obj["reviewText"].replace(" ", "")) <= upper_limit_characters)

# Estimate the total count of elements in the RDD. The timeout is expressed in
# milliseconds and the result may be incomplete (possibly 0) when it expires,
# so fall back to an exact count in that case.
estimated_count = dataset5.countApprox(timeout=100, confidence=0.95)
if estimated_count <= 0:
    estimated_count = dataset5.count()

# Calculate the fraction based on the desired sample size and estimated count.
# An empty RDD keeps the fraction at 1.0 instead of dividing by zero.
fraction = min(limit / estimated_count, 1.0) if estimated_count > 0 else 1.0

# Sample items (without replacement) and save file
save_rdd_to_json_file(path_dataset_directory + "/dataset5", dataset5.sample(False, fraction))

In [None]:
# Create dataset from files produced by previous cells: the helper receives the
# folder the per-rating samples were saved under and the path of the merged file.
# NOTE(review): merge_files is imported from utility_functions; how it combines
# the files is not visible here - confirm before relying on ordering/format.
merge_files(path_dataset_directory, path_merged_dataset)