In [1]:
from pyspark.sql import SparkSession
import pandas as pd
from utility_functions import *
import os
import enchant

In [2]:
# Build the Spark session, or reuse the active one (getOrCreate), with the
# memory / timeout settings used by this notebook.
spark = (
    SparkSession.builder
    # Heap size requested for each executor and for the driver
    .config("spark.executor.memory", "32g")
    .config("spark.driver.memory", "32g")
    # Off-heap overhead requested per executor
    .config("spark.executor.memoryOverhead", "12g")
    # Network timeout and executor heartbeat interval
    .config("spark.network.timeout", "1200s")
    .config("spark.executor.heartbeatInterval", "120s")
    .getOrCreate()
)

In [3]:
# Parent of the current working directory, with backslashes replaced by
# forward slashes so the same path strings work on Windows too.
directory = os.path.dirname(os.getcwd()).replace("\\", "/")

# GLOBAL VARIABLES

# Input dataset (JSON)
PATH_DATASET = f"{directory}/data/datasets/dataset-cleaned-no-unknown.json"

# Sentiment-knowledge files: word occurrences, seed lists and generated outputs
PATH_OCCURRENCES = f"{directory}/data/sentiment-knowledge/all-words-with-occurrences.csv"
PATH_SEEDS_POSITIVE = f"{directory}/data/sentiment-knowledge/seeds-positive.txt"
PATH_SEEDS_NEGATIVE = f"{directory}/data/sentiment-knowledge/seeds-negative.txt"
PATH_CO_OCCURRENCES = f"{directory}/data/sentiment-knowledge/co-occurrences.csv"
PATH_PMI = f"{directory}/data/sentiment-knowledge/pmi.csv"

# Column names (presumably those of the occurrences CSV — TODO confirm)
FIRST_COLUMN_OCC = "word"
SECOND_COLUMN_OCC = "occurrences"

# Chunk size; not referenced by the cells visible in this notebook section
CHUNK_SIZE = 100000

# Enchant dictionary for the "en-US" tag, used to spell-check words
DICTIONARY_US = enchant.Dict("en-US")

In [4]:
# Read the JSON dataset at PATH_DATASET into a Spark DataFrame
dataset = spark.read.json(path=PATH_DATASET)

In [5]:
# Get list of words with their occurrence counts.
# keep_default_na=False stops pandas from turning tokens such as "null" or
# "NA" into NaN, so every word stays a literal string (an empty field stays "").
df_words = pd.read_csv(PATH_OCCURRENCES, keep_default_na=False, dtype={"word": str, "occurrences": int})

# Get seeds: positive file first, then negative, accumulated into one list.
# read_lines is a project helper; presumably it appends each line of the file
# to the given list — TODO confirm.
seeds = []
with open(PATH_SEEDS_POSITIVE, 'r') as f1, open(PATH_SEEDS_NEGATIVE, 'r') as f2:
    read_lines(f1, seeds)
    read_lines(f2, seeds)

# Set view of the seeds so the membership test in the loop below is O(1)
# instead of a linear scan of the list for every vocabulary word.
# Assumes the seed entries are hashable (strings) — TODO confirm.
seeds_lookup = set(seeds)

# Create dictionaries for fast lookup, keyed by word:
#   occurences_dict -> occurrence count
#   checked_dict    -> result of the Enchant spell check
#   seeds_dict      -> True when the word is one of the seeds
# and accumulate the total number of occurrences over all words.
tot_occurrences = 0
occurences_dict = {}
seeds_dict = {}
checked_dict = {}
for row in df_words.itertuples():
    word = row.word
    # Increase total number of occurrences
    tot_occurrences += row.occurrences
    # Add word to dictionary of occurrences
    occurences_dict[word] = row.occurrences
    # pyenchant's Dict.check raises ValueError on an empty string, and empty
    # words can reach this point because of keep_default_na=False; treat them
    # as "not a dictionary word" instead of aborting the whole cell.
    checked_dict[word] = bool(word) and DICTIONARY_US.check(word)
    # Add word to dictionary for seeds identification
    seeds_dict[word] = word in seeds_lookup

In [6]:
# Stage 1: clean and tokenize the "reviewText" field of every record.
tokenized_reviews = dataset.rdd.map(
    lambda review: tokenize_with_sequences(
        remove_symbols_before_tokenization(review["reviewText"])
    )
)

# Stage 2: emit one (pair, 1) record per pair returned by get_pairs_word_seed
# and sum the counts per pair. Each pair is indexed below as pair[0] / pair[1]
# (presumably word and seed — TODO confirm against utility_functions).
pair_counts = tokenized_reviews.flatMap(
    lambda tokens: [
        (word_seed, 1)
        for word_seed in get_pairs_word_seed(tokens, seeds_dict, occurences_dict, checked_dict)
    ]
).reduceByKey(lambda left, right: left + right)

# Stage 3: turn every ((first, second), count) record into
# (first, second, pmi(count, occurrences of first, occurrences of second, total)).
pmi_triples = pair_counts.map(
    lambda entry: (
        entry[0][0],
        entry[0][1],
        pmi(
            entry[1],
            occurences_dict[entry[0][0]],
            occurences_dict[entry[0][1]],
            tot_occurrences,
        ),
    )
)

# Stage 4: order by the pmi value, highest first, bring the rows to the driver
# and hand them to save_list_to_csv together with the output path and header.
ranked_triples = pmi_triples.sortBy(lambda triple: triple[2], ascending=False)
save_list_to_csv(ranked_triples.collect(), PATH_PMI, ['word1', 'word2', 'pmi'])