# Sentiment knowledge miner for polarity computation
Disclaimer: To run this notebook, launch pyspark (command "pyspark --master local[*number of cores*]") from the folder containing the notebook.

In [None]:
from pyspark.sql import SparkSession
import pandas as pd
from utility_functions import *
import os
import enchant

In [None]:
# Build the Spark session used by the rest of the notebook, or reuse the one
# that is already active (getOrCreate).
#
# Two settings differ from a naive "set everything to the maximum" setup:
#  * spark.executor.heartbeatInterval must stay well below
#    spark.network.timeout; with both at 1200s Spark's configuration
#    validation rejects the pair, so the heartbeat is kept at 120s.
#  * spark.executor.extraJavaOptions must not carry a max-heap flag (-Xmx);
#    the executor heap ceiling is already given by spark.executor.memory.
#    Only the initial heap size (-Xms) is passed through here.
spark = SparkSession.builder \
    .config("spark.executor.memory", "32g") \
    .config("spark.driver.memory", "32g") \
    .config("spark.network.timeout", "1200s") \
    .config("spark.executor.memoryOverhead", "12g") \
    .config("spark.executor.heartbeatInterval", "120s") \
    .config("spark.executor.extraJavaOptions", "-Xms12g") \
    .getOrCreate()

In [None]:
# Parent folder of the current working directory, with backslashes turned
# into forward slashes so the concatenated paths below use one separator
directory = os.path.dirname(os.getcwd()).replace("\\", "/")

# --- Dataset location ---
PATH_DATASET = f"{directory}/data/datasets/dataset-cleaned-no-unknown.json"

# --- Sentiment-knowledge files ---
PATH_OCCURRENCES = f"{directory}/data/sentiment-knowledge/all-words-with-occurrences.csv"
PATH_CO_OCCURRENCES = f"{directory}/data/sentiment-knowledge/co-occurrences.csv"
PATH_SEEDS_POSITIVE = f"{directory}/data/sentiment-knowledge/seeds-positive.txt"
PATH_SEEDS_NEGATIVE = f"{directory}/data/sentiment-knowledge/seeds-negative.txt"
PATH_PMI = f"{directory}/data/sentiment-knowledge/pmi.csv"
PATH_POLARITY = f"{directory}/data/sentiment-knowledge/polarity.csv"

# --- Column names of the occurrences file ---
FIRST_COLUMN_OCC = "word"
SECOND_COLUMN_OCC = "occurrences"

# --- Miscellaneous ---
CHUNK_SIZE = 100000
# Enchant dictionary opened for the "en-US" tag
DICTIONARY_US = enchant.Dict("en-US")

In [None]:
# Read the JSON file at PATH_DATASET through the Spark session's reader;
# the result is kept in `dataset` for the cells that follow
dataset = spark.read.json(path=PATH_DATASET)

In [None]:
# Load the word/occurrence table. keep_default_na=False keeps words such as
# "null" or "nan" as literal strings instead of converting them to NaN.
df_words = pd.read_csv(PATH_OCCURRENCES, keep_default_na=False, dtype={"word": str, "occurrences": int})

# Read the positive and negative seed words. read_lines comes from
# utility_functions and is handed each open file together with the list it
# should fill (presumably one seed per line - confirm against the helper).
seeds_pos = []
seeds_neg = []
with open(PATH_SEEDS_POSITIVE, 'r') as f1, open(PATH_SEEDS_NEGATIVE, 'r') as f2:
    read_lines(f1, seeds_pos)
    read_lines(f2, seeds_neg)

# seed -> True (positive) / False (negative). Negative seeds are written
# last, so a word present in both files ends up marked as negative.
seeds_polarity = {seed: True for seed in seeds_pos}
seeds_polarity.update({seed: False for seed in seeds_neg})

# Every seed, positives first
seeds = seeds_pos + seeds_neg

# Build the lookup tables consumed by the Spark job:
#   occurences_dict : word -> number of occurrences
#   checked_dict    : word -> result of the enchant dictionary check
#   seeds_dict      : word -> whether the word is one of the seeds
# and accumulate the total number of occurrences over all words.
tot_occurrences = 0
occurences_dict = {}
seeds_dict = {}
checked_dict = {}
for row in df_words.itertuples():
    tot_occurrences += row.occurrences
    occurences_dict[row.word] = row.occurrences
    checked_dict[row.word] = DICTIONARY_US.check(row.word)
    # Membership is tested against the seeds_polarity dict (same keys as the
    # `seeds` list) so each lookup is O(1) instead of a scan of the list
    seeds_dict[row.word] = row.word in seeds_polarity

In [None]:
# Stage 1: clean each review's text and tokenize it
tokenized_reviews = dataset.rdd.map(
    lambda review: tokenize_with_sequences(remove_symbols_before_tokenization(review["reviewText"]))
)

# Stage 2: emit (pair, 1) for every pair returned by get_pairs_word_seed and
# sum the ones per pair to obtain how many times each pair was produced
pair_counts = tokenized_reviews \
    .flatMap(lambda tokens: [(found, 1) for found in get_pairs_word_seed(tokens, seeds_dict, occurences_dict, checked_dict)]) \
    .reduceByKey(lambda left, right: left + right)

# Stage 3: turn each (pair, count) into (first word, second word, pmi score),
# where the score is computed from the pair count, the occurrences of both
# words and the total number of occurrences
scored_pairs = pair_counts.map(
    lambda entry: (
        entry[0][0],
        entry[0][1],
        pmi(entry[1], occurences_dict[entry[0][0]], occurences_dict[entry[0][1]], tot_occurrences),
    )
)

# Stage 4: keep strictly positive scores only, highest score first
ranked_pairs = scored_pairs \
    .filter(lambda triple: triple[2] > 0) \
    .sortBy(lambda triple: triple[2], ascending=False)

# Bring the result to the driver and write it to PATH_PMI
save_list_to_csv(ranked_pairs.collect(), PATH_PMI, ['word1', 'word2', 'pmi'])

In [None]:
# Reload the PMI table written to PATH_PMI. keep_default_na=False together
# with explicit string dtypes matches the way the occurrences file is read:
# without them pandas turns words such as "null", "nan" or "NA" into float
# NaN, which would raise a KeyError on the seeds_polarity lookup below or
# end up as a NaN key in polarity_dict.
df_pmi = pd.read_csv(PATH_PMI, keep_default_na=False, dtype={"word1": str, "word2": str})

# Seeds start from a fixed score: +1000 for positive, -1000 for negative
polarity_dict = {seed: (1000 if seeds_polarity[seed] else -1000) for seed in seeds}

# Every (word1, word2, pmi) row moves the score of word2: the pmi is added
# when word1 is a positive seed and subtracted when it is a negative seed.
# Words seen for the first time start from 0.
for row in df_pmi.itertuples():
    signed_pmi = row.pmi if seeds_polarity[row.word1] else -row.pmi
    polarity_dict[row.word2] = polarity_dict.get(row.word2, 0) + signed_pmi

# Save the words ordered by decreasing absolute polarity
save_list_to_csv(sorted(polarity_dict.items(), key=lambda item: abs(item[1]), reverse=True), PATH_POLARITY, ["word", "polarity"])