# Data cleaner
Disclaymer: To run this notebook, launch pyspark (command "pyspark --master local[*number of cores*]") from the folder containing the notebook.

In [1]:
from pyspark.sql import SparkSession
from utility_functions import *
from flashtext import KeywordProcessor

In [2]:
# Create spark session
spark = SparkSession.builder \
    .config("spark.executor.memory", "32g") \
    .config("spark.driver.memory", "32g") \
    .config("spark.network.timeout", "1200s") \
    .config("spark.executor.memoryOverhead", "12g") \
    .config("spark.executor.heartbeatInterval", "120s") \
    .getOrCreate()

In [3]:
# Load dataset
path = "data/dataset.jsonl"
dataset = spark.read.json(path)

# Split rdd into multiple rdds
split_rdds = dataset.randomSplit([0.1 for _ in range(0,10)])

In [4]:
# Create rdd of cleaned text
cleaned_rdds = []

for df in split_rdds:
    cleaned_rdds.append(
        df.rdd.map(lambda obj: {"overall": obj["overall"], "reviewText": obj["reviewText"].lower()})
            .filter(lambda obj: "old review" not in obj["reviewText"]) # Remove objects containing "old review"
            .map(lambda obj: {"overall": obj["overall"], "reviewText": cleaning_function_no_unknown(obj["reviewText"])}) # Clean
    )

In [5]:
# Save cleaned dataset with unknown words
path_cleaned = "data/dataset-cleaned.jsonl"

with open(path_cleaned, "w") as f:
    for rdd in cleaned_rdds:
        for item in rdd.collect():
            f.write(str(item) + "\n")

In [6]:
# Load cleaned dataset
dataset_cleaned = spark.read.json(path_cleaned)

# Split rdd into multiple rdds
split_rdds_cleaned = dataset_cleaned.randomSplit([0.1 for _ in range(0,10)])

In [7]:
# Compute list of words with number of occurrences
words_occurrences_rdds = []

for df in split_rdds_cleaned:
    words_occurrences_rdds.append(
        df.rdd.flatMap(lambda obj: [(word, 1) for word in tokenize_with_sequences(remove_symbols_before_tokenization(obj["reviewText"]))]) # Tokenize
            .reduceByKey(lambda x, y: x + y) # Add occurrences
    )

In [8]:
# Merge rdds, then reduce by key to obtain final vocabulary with number of occurreces
merged_rdd_occurrences = words_occurrences_rdds[0].union(words_occurrences_rdds[1])

for i in range(2, len(words_occurrences_rdds)):
    merged_rdd_occurrences = merged_rdd_occurrences.union(words_occurrences_rdds[i])

words_with_occurrences = merged_rdd_occurrences.reduceByKey(lambda x, y: x + y).collect()

In [9]:
# Process list of words with occurrences
count = 0
frequent_words = []
unknown_words = []
for word in sorted(words_with_occurrences, key=lambda x: x[1]):
    if word[1] <= 10:
        count += word[1]
        unknown_words.append(word[0])
    else:
        frequent_words.append(word)

# Append unknown
frequent_words.append(('[UNKNOWN]', count))

# Create keyword processor for later use
kp = KeywordProcessor()
for word in unknown_words:
    kp.add_keyword(word[0], '[UNKNOWN]')

# Save dataframe with occurrences of known words for later use
path_all_occurrences = "data/sentiment-knowledge/all-words-with-occurrences.csv"
save_list_to_csv(sorted(frequent_words, key=lambda x: x[1]), path_all_occurrences, ["word", "occurrences"])

In [10]:
# Create rdd of cleaned text without unknown words
cleaned_no_unknown_rdds = []

for df in split_rdds_cleaned:
    cleaned_no_unknown_rdds.append(
        df.rdd.map(lambda obj: {"overall": obj["overall"], "reviewText": kp.replace_keywords(obj["reviewText"])}) # Remove unknown
    )

In [11]:
# Save cleaned dataset without unknown
path_cleaned_unknown = "data/dataset-cleaned-no-unknown.jsonl"

with open(path_cleaned_unknown, "w") as f:
    for rdd in cleaned_no_unknown_rdds:
        for item in rdd.collect():
            f.write(str(item) + "\n")

In [12]:
count_frequent = 0

for word in frequent_words:
    count_frequent += word[1]

print(f"The percentage of retained words is {(count_frequent * 100)/(count_frequent + count)}")

The percentage of retained words is 99.62825188302014
