# Generator for dataset with two classes
Disclaimer: To run this notebook, launch pyspark (command "pyspark --master local[*number of cores*]") from the folder containing the notebook.

In [1]:
from pyspark.sql import SparkSession
from utility_functions import *

In [2]:
# Imported explicitly so this cell does not depend on `os` leaking in through
# the wildcard import of utility_functions.
import os

# Parent of the current working directory, with Windows separators normalised
# to forward slashes.
# NOTE(review): presumably the project root, given that the notebook is
# launched from its own folder - confirm against the repository layout.
directory = os.path.dirname(os.getcwd()).replace("\\", "/")

# Input: cleaned dataset; outputs: merged two-class file and the directory
# holding the per-split files.
path_cleaned_unknown = f"{directory}/data/datasets/dataset-cleaned-no-unknown.json"
path_two_classes = f"{directory}/data/datasets/dataset-two-classes.json"
path_two_classes_directory = f"{directory}/data/datasets/dataset-two-classes"

In [3]:
# Create (or reuse) the Spark session used by the rest of the notebook.
# NOTE(review): if a session already exists (e.g. one created by the `pyspark`
# launcher), getOrCreate() returns it and JVM-level settings such as
# spark.driver.memory may not take effect - confirm for your launch mode.
spark = (
    SparkSession.builder
    .config("spark.executor.memory", "32g")
    .config("spark.driver.memory", "32g")
    .config("spark.network.timeout", "1200s")
    .config("spark.executor.memoryOverhead", "12g")
    # The heartbeat interval must stay well below spark.network.timeout;
    # setting both to 1200s (as before) violates that requirement.
    .config("spark.executor.heartbeatInterval", "120s")
    # Spark rejects -Xmx in extraJavaOptions; the maximum executor heap is
    # already set through spark.executor.memory above.
    .config("spark.executor.extraJavaOptions", "-Xms12g")
    .getOrCreate()
)

In [4]:
# Load the cleaned reviews, reading only the rating and the review text
dataset = spark.read.json(
    path_cleaned_unknown,
    schema="overall float, reviewText string",
)

# Randomly split the DataFrame into ten equally weighted DataFrames
split_rdds = dataset.randomSplit([0.1] * 10)

In [5]:
def _has_polar_rating(row):
    """Return True when the row has a rating and it is not the neutral 3.

    Rows without a rating are rejected here because the `<=` comparison in
    `_to_binary_label` would raise TypeError on None.
    """
    rating = row["overall"]
    return rating is not None and rating != 3.


def _to_binary_label(row):
    """Map a rated row to a dict with a binary label.

    Ratings <= 2 become 0.0, everything else becomes 1.0; the review text is
    passed through unchanged.
    """
    return {
        "overall": 0. if row["overall"] <= 2. else 1.,
        "reviewText": row["reviewText"],
    }


# Build one two-class RDD per split
rdds = [
    df.rdd.filter(_has_polar_rating).map(_to_binary_label)
    for df in split_rdds
]

# Save each two-class split under its own numbered path
for i, rdd in enumerate(rdds):
    save_rdd_to_json_file(f"{path_two_classes_directory}/two-classes{i}", rdd)

# Create the final dataset from the saved files
# NOTE(review): presumably merge_files combines the per-split output into
# path_two_classes - confirm in utility_functions.
merge_files(path_two_classes_directory, path_two_classes)