# Seed words finder for PMI score computation
Disclaimer: To run this notebook, launch pyspark (command "pyspark --master local[*number of cores*]") from the folder containing the notebook.

In [None]:
from pyspark.sql import SparkSession
from utility_functions import *

In [None]:
# Create the Spark session (or reuse an already running one), applying the
# memory, timeout and heartbeat settings below in order.
spark_settings = (
    ("spark.executor.memory", "32g"),
    ("spark.driver.memory", "32g"),
    ("spark.network.timeout", "1200s"),
    ("spark.executor.memoryOverhead", "12g"),
    ("spark.executor.heartbeatInterval", "120s"),
)

session_builder = SparkSession.builder
for setting_name, setting_value in spark_settings:
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

In [None]:
# Load the dataset (JSON Lines file) into a DataFrame.
# The path uses forward slashes: a backslash before "d" is an invalid escape
# sequence in a normal string literal and only acts as a separator on Windows.
path = "data/dataset-cleaned-no-unknown.jsonl"
dataset = spark.read.json(path)

# Randomly split the DataFrame into 10 equally weighted DataFrames
split_rdds = dataset.randomSplit([0.1] * 10)

In [None]:
# Compute the polarity of words, one RDD per split.
def _occurrence_polarities(review):
    """Return one (word, polarity) pair for every token of the review text.

    The polarity of each occurrence is obtained by passing the review's
    "overall" field to calculate_polarity_of_occurrence.
    """
    tokens = tokenize_with_sequences(
        remove_symbols_before_tokenization(review["reviewText"])
    )
    return [
        (token, calculate_polarity_of_occurrence(review["overall"]))
        for token in tokens
    ]


words_polarity_rdds = [
    # Emit the polarity of each occurrence, then sum the polarities per word
    part.rdd.flatMap(_occurrence_polarities).reduceByKey(lambda a, b: a + b)
    for part in split_rdds
]

In [None]:
# Merge the per-split RDDs into a single RDD, then reduce by key so that each
# word ends up with one summed polarity: this is the final vocabulary.
merged_rdd_polarity = words_polarity_rdds[0]

# Iterate over the remaining RDDs directly instead of indexing by position;
# this also works when the list holds a single RDD.
for partial_rdd in words_polarity_rdds[1:]:
    merged_rdd_polarity = merged_rdd_polarity.union(partial_rdd)

# collect() brings the resulting (word, polarity) pairs back to the driver as
# a Python list.
word_with_polarity = merged_rdd_polarity.reduceByKey(lambda x, y: x + y).collect()

In [None]:
# Write the (word, polarity) pairs to a CSV file, ordered by ascending polarity
path_csv = "data/sentiment-knowledge/words-polarity.csv"
rows_by_polarity = sorted(word_with_polarity, key=lambda pair: pair[1])
save_list_to_csv(rows_by_polarity, path_csv, ['word', 'polarity'])