### Set up the sample project for running our GloVe implementation

Import the modules required to run the code, then load the sample data used to test the project.

In [11]:
import string
from src import context_dictionary
from src.prepare_court_data import import_dataframe
from pyspark.sql.functions import udf, explode, monotonically_increasing_id
from pyspark.sql.types import ArrayType, StringType, IntegerType, MapType
from nltk.tokenize import sent_tokenize, word_tokenize

# Load the sample opinions with the explicit parquet reader so the result does not depend on
# the session's default data source (`spark.sql.sources.default`).
# NOTE(review): `spark` is not created in this notebook; presumably the kernel (e.g. a PySpark
# shell/kernel) provides the SparkSession — confirm before running on a plain Python kernel.
df_opinions_unparsed = spark.read.parquet('data/wash_state_1000_opinions.parquet')


Then build the lists of words for each sentence in each document. The end result, for each document, is a list whose items are the word lists of that document's sentences. We chose the nested list to preserve sentence boundaries when counting how many times each word appears in the context of another word: the context does not extend beyond the sentence boundary.

In [12]:
# Translation table mapping every punctuation character to a space. It is built once here
# instead of once per sentence inside the UDF.
PUNCT_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))


def tokenize_document(doc):
    """Split one document into sentences and each sentence into lower-cased word tokens.

    Parameters
    ----------
    doc : str or None
        Raw text of a single document. Spark passes ``None`` for null column values.

    Returns
    -------
    list of list of str
        One inner list of tokens per sentence, in document order. An empty list is
        returned for a null document instead of raising ``AttributeError``.
    """
    if doc is None:
        return []
    # Newlines become spaces and the text is lower-cased before sentence splitting.
    flattened = doc.replace('\n', ' ').strip().lower()
    return [
        # Punctuation (apostrophes included) is replaced by spaces before word tokenization,
        # so tokens never contain punctuation characters from string.punctuation.
        word_tokenize(sentence.translate(PUNCT_TO_SPACE))
        for sentence in sent_tokenize(flattened)
    ]


# Declare the nested array-of-strings return type so Spark can build the column schema.
token_lists = udf(tokenize_document, ArrayType(ArrayType(StringType())))
df_words = df_opinions_unparsed.withColumn('sents', token_lists('parsed_text'))
# NOTE(review): df_words is not persisted; the original had `df_words.persist()` commented out.
# Consider enabling it if the tokenization UDF is being recomputed by several downstream cells.


Create a vocabulary list to store each unique term from all the documents. This can be used later to map terms to index numbers, but may not be necessary for the current process, where the term co-occurrences will be transformed into a graph to improve lookup speeds. Note that `monotonically_increasing_id` guarantees unique, increasing ids, but not consecutive ones.

In [13]:
# Flatten documents -> sentences -> tokens, keep each distinct token once,
# then attach a unique (not necessarily consecutive) 64-bit id to every token.
vocab_list = (
    df_words
    .select(explode('sents').alias('lists'))    # one row per sentence
    .select(explode('lists').alias('words'))    # one row per token
    .distinct()
    .withColumn('id', monotonically_increasing_id())
)
vocab_list.persist()

DataFrame[words: string, id: bigint]

Here we build the context dictionary for each term. The terms are counted within a predefined window and within sentences.

In [14]:
# Wrap context_dictionary.context as a Spark UDF. The declared return type is a map of
# token -> (context token -> integer count), computed per document from its sentence lists.
udf_contexts = udf(
    context_dictionary.context,
    MapType(StringType(), MapType(StringType(), IntegerType())),
)
df_word_dicts = df_words.withColumn('cooccurrence_dicts', udf_contexts('sents'))
df_word_dicts.persist()

# Unnest the per-document maps into (token, context, count) rows and total the counts
# across all documents. The aggregate column keeps Spark's default name `sum(count)`.
context_counts = (
    df_word_dicts
    .select(explode('cooccurrence_dicts').alias('token', 'context'))
    .select('token', explode('context').alias('context', 'count'))
    .groupBy('token', 'context')
    .sum('count')
)
context_counts.persist()

DataFrame[token: string, context: string, sum(count): bigint]