## (P)PMI

#### The roadmap 

1. load all the relevant data
2. normalize data (tokenizing, punctuation removal, stemming)
3. decide on your context and visualize your word-context matrix
4. use (P)PMI for smoothing your data
5. Display the word-context matrix with new values

The crucial question is: how do we want to build the word-context matrix? There are a number of ways to approximate the "context". The context can be very large, i.e. the whole document can be considered as a context; the result would then rather be a word-document matrix. A smaller context could be the sentence in which a word occurs, or a defined number of words to the left and/or right of the word in question (the number of co-occurring words is called the _window size_).

Main code reference:
http://www.katrinerk.com/courses/computational-semantics-undergraduate/demo-the-building-blocks-of-a-distributional-model

#### Source data

The articles below are stored in the 'rex' folder.

1. https://www.theguardian.com/science/2007/apr/13/uknews.taxonomy (rex_guardian)

2. http://www.telegraph.co.uk/news/science/science-news/3340709/Chicken-is-T-rexs-closes-living-relative.html (rex_graph)

3. http://www.independent.co.uk/news/science/tyrannosaurus-rex-was-more-like-a-chicken-than-a-crocodile-815417.html (rex_indy)

4. https://www.livescience.com/1410-rex-related-chickens.html (rex_science)

In [None]:
import nltk
import numpy as np
import string
import re
import os

#nltk.download('punkt')   # Punkt Tokenizer Model
#nltk.download('averaged_perceptron_tagger')  # Part-of-Speech Tagger
#nltk.download("stopwords") # Stopwords

# modules for tokenization and removal of stopwords
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

# modules for stemming
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import SnowballStemmer 

In [None]:
# Replace this directory with one on your own machine

demo_dir = "rex"

#import os

# We iterate over the corpus files.

# os.listdir lists the names of all files in a directory
for filename in os.listdir(demo_dir):
    if filename.endswith("txt"):
        print("reading file", filename)
        # Read through a context manager so the file handle is closed
        # as soon as the content has been loaded.
        with open(os.path.join(demo_dir, filename)) as corpus_file:
            text = corpus_file.read()
            

In [None]:
def do_stemming(filtered):
    """Return the stem of every token in *filtered*, in the same order.

    filtered: an iterable of word strings.
    Returns a list with one stemmed string per input token.
    """
    # Build the stemmer once instead of once per token.
    # Alternatives that can be swapped in here: PorterStemmer(), LancasterStemmer()
    stemmer = SnowballStemmer('english')
    return [stemmer.stem(token) for token in filtered]

In [None]:
##
# NLTK processing objects

#import nltk

#import string

def preprocess(s):
    """Turn a raw string into a list of normalized, stemmed tokens.

    Steps: split on whitespace, lowercase, strip punctuation from both ends
    of each token, drop English stopwords and empty tokens, then stem.

    s: the text to process.
    Returns the list produced by do_stemming() for the remaining tokens.
    """
    # Fetch the stopword list once and keep it in a set, rather than
    # repeating the lookup against a list for every single token.
    stop_set = set(stopwords.words('english'))

    # split up into words, lowercase, remove punctuation at beginning and end of word
    stripped = (w.lower().strip(string.punctuation) for w in s.split())

    # A token made only of punctuation (e.g. "-") strips down to "";
    # skip it so it is not counted as a word.
    content_words = [w for w in stripped if w and w not in stop_set]
    return do_stemming(content_words)                        #include stemming


# or like this:
# def preprocess(s):
#     words =  [ ]
#     for w in s.split():
#         word = w.lower()
#         word = word.strip(string.punctuation)
#         words.append(word)
#     return words



# use the function like this (the call returns the list of processed tokens):
preprocess("This is a test.")

In [None]:

#####################
# Counting words:
# We want to make a list of the N most frequent words in our corpus

#import os

def do_word_count(demo_dir, numdims):
    """Return the *numdims* most frequent preprocessed words in the corpus.

    demo_dir: directory whose files ending in "txt" make up the corpus.
    numdims: how many of the most frequent words to keep.
    Returns a list of words, most frequent first.
    """
    # we store the counts in word_count
    # using NLTK's FreqDist
    word_count = nltk.FreqDist()

    # We iterate over the corpus files. os.listdir gives no ordering
    # guarantee, so sort the names to make the result reproducible
    # when several words share the same frequency.
    for filename in sorted(os.listdir(demo_dir)):
        if filename.endswith("txt"):
            print("reading file", filename)
            # the context manager closes the file once it has been read
            with open(os.path.join(demo_dir, filename)) as corpus_file:
                text = corpus_file.read()
            word_count.update(preprocess(text))

    # most_common yields (word, frequency) pairs; only the words are kept
    keep_these_words = [w for w, _freq in word_count.most_common(numdims)]
    # print("Target words:\n", keep_these_words, "\n")

    return keep_these_words

# or like this, without FreqDist:
# def do_word_count(demo_dir, numdims):
#     word_count = { }

#     for filename in os.listdir(demo_dir):
#         if filename.endswith("txt"):
#             print("reading file", filename)
#         text = open(os.path.join(demo_dir, filename)).read()
#         for taggedword in preprocess(text):
#             if taggedword not in word_count:
#                 word_count[ taggedword ] = 0
#             word_count[ taggedword ] += 1
#
#     def map_word_to_count(word): return word_count[ word ]
#     keep_these_words = sorted(word_count.keys(), key = map_word_to_count)[:numdims]
#    
#     # print("Target words (and also dimensions):\n", keep_these_words, "\n")
#
#     return keep_these_words



##
# run this:
def test_wordcount():
    """Print the kept context words for a cutoff of 10 and of 100 dimensions."""
    print("Doing a frequency-based cutoff: keeping only the N most frequent context words.")

    # (number of dimensions, message printed in front of the result)
    runs = (
        (10, "Keeping only 10 dimensions, then I get:"),
        (100, "Keeping 100 dimensions, then I get:"),
    )
    for dimension_count, message in runs:
        keepwords = do_word_count(demo_dir, dimension_count)
        print(message, keepwords, "\n")

In [None]:
# Run the frequency-cutoff demo (10 and 100 dimensions).
test_wordcount()

In [None]:

###
# identifying context words for a narrow context window of 2 words on either side
# of the target:
# takes as input a sequence of words for counting.
# For each word in the sequence, make 4 pairs:
# (word, left neighbor of word), (word, left neighbor of left neighbor of word),
# (word, right neighbor of word), (word, right neighbor of right neighbor of word),
# so pair each word with all its context items in the context window.
# Return a list of these pairs.
def co_occurrences(wordsequence, window=2):
    """Pair every word with the words inside its context window.

    wordsequence: a sequence of words (e.g. one preprocessed sentence).
    window: how many neighbors on either side count as context.
        Defaults to 2, the window this function originally hard-coded.

    Returns a list of (target, context) tuples. Every co-occurrence is
    recorded in both directions, so each word appears as the target of
    all neighbors within `window` positions to its left and right.

    Raises ValueError if `window` is negative.
    """
    if window < 0:
        raise ValueError("window must be non-negative")

    target_context_pairs = [ ]
    length = len(wordsequence)

    for index in range(length):
        # look only to the right; the reversed pair added below covers
        # the left-hand context of the later word
        for offset in range(1, window + 1):
            neighbor = index + offset
            if neighbor >= length:
                # ran off the end of the sequence
                break
            # count that word[index] as a target co-occurred with
            # word[neighbor] as a context item, and vice versa
            target_context_pairs.append( (wordsequence[index], wordsequence[neighbor]) )
            target_context_pairs.append( (wordsequence[neighbor], wordsequence[index]) )

    return target_context_pairs

###
# run this to test co-occurrences
def test_cooccurrences():
    """Print the target/context pairs found in one sample sentence."""
    sample = """You will not find Dr. Jekyll; he is from home," replied Mr. Hyde"""
    print("Testing the function that pairs up each target word with its context words.")
    print("Original text:", sample, "\n")

    # normalize the sentence, then pair each word with its window neighbors
    pairs = co_occurrences(preprocess(sample))
    print("These are the target/context pairs:", pairs, "\n")

In [None]:
# Print the target/context pairs for the sample sentence.
test_cooccurrences()

In [None]:

##
# We will need the function make_word_index below.
# It maps each word that we want to keep around as a context item
# to an index, which will be its place in the table of counts,
# that is, its dimension in the space

def make_word_index(keep_these_words):
    """Map each word in *keep_these_words* to its position in that sequence.

    If a word occurs more than once, its last position is the one kept.
    """
    return {word: position for position, word in enumerate(keep_these_words)}

#import numpy #as np

# read all files in demo_dir, and compute a counts vector
# of length numdims for each relevant word.
# The function takes as input also a mapping word_index from relevant words
# to their dimension, from which we derive a set relevant_words.
# This function reads the texts one sentence at a time.
# In each sentence, it identifies context words in the window
# defined by co_occurrences(), and stores them if both the target
# and its context words are relevant_words

def make_space(demo_dir, word_index, numdims):
    """Count co-occurrences of the relevant words across the corpus.

    demo_dir: directory whose files ending in "txt" make up the corpus.
    word_index: mapping from each relevant word to its dimension
        (column) in the count vectors.
    numdims: length of every count vector.

    Returns a dict mapping each relevant word to an integer numpy array
    of length `numdims`; entry word_index[c] of the vector for target t
    is how often c occurred in t's context window.
    """
    # relevant words: those that have an entry in word_index
    relevant_words = set(word_index)

    # space: a mapping from relevant words to an array of raw counts,
    # filled with zeros. Entries are created in index order so that the
    # row order of the space matches its column order.
    # dtype=int replaces the np.int alias, which NumPy 1.24 removed.
    space = {
        word: np.zeros(numdims, dtype=int)
        for word in sorted(word_index, key=word_index.get)
    }

    ##
    # Design decision: We want to take sentence boundaries into account
    # when computing distributional representations.
    # So we need to detect sentence boundaries first.
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

    # We iterate over the corpus files
    # and count word co-occurrences in the window defined by co_occurrences()
    for filename in os.listdir(demo_dir):
        if not filename.endswith("txt"):
            continue
        print("reading file", filename)
        # read the text; the context manager closes the file afterwards
        with open(os.path.join(demo_dir, filename)) as corpus_file:
            text = corpus_file.read()

        # split the text into sentences and process one sentence at a time
        for sentence in sent_detector.tokenize(text):
            words = preprocess(sentence)

            # determine pairs of co-occurrences to count,
            # and store them in the matrix
            for target, cxitem in co_occurrences(words):
                # only count pairs in which both words are relevant
                if target in relevant_words and cxitem in relevant_words:
                    # the context item selects the column, the target the row
                    space[target][word_index[cxitem]] += 1

    return space

###
# run this
def test_space():
    """Build the raw count space for the 10 most frequent words and print it.

    Returns (space, words_in_order): the count vectors and the words
    sorted by their dimension index.
    """
    numdims = 10 # change the dimensions at free will!
    # which words to use as targets and context words?
    ktw = do_word_count(demo_dir, numdims)
    # mapping words to an index, which will be their column
    # in the table of counts
    wi = make_word_index(ktw)
    ranked = sorted(wi.items(), key=lambda entry: entry[1])
    words_in_order = [word for word, _ in ranked]

    print("word index:")
    for word, position in ranked:
        print(word, position, end= " ")
    print("\n")

    space = make_space(demo_dir, wi, numdims)

    print("some words from the space")
    for shown in words_in_order[:10]:
        print(shown,  space[shown], "\n")

    return space, words_in_order

In [None]:
# Build and print the raw count space; the returned tuple is not assigned here.
test_space()

In [None]:
# Build the raw count space again, this time keeping the count vectors
# and the words sorted by their dimension index.
space, words_in_order = test_space()

import pandas as pd
# NOTE(review): `operator` is not used in the cells visible here — confirm before removing.
import operator

# One row per key of `space`; each vector becomes that row's values.
word_context_matrix = pd.DataFrame.from_dict(space, orient='index')
# Label the columns with the context words in index order.
word_context_matrix.columns = words_in_order

print(word_context_matrix)
#print(words_in_order)

### Compute (P)PMI

$PMI(w_{1},w_{2}) = log(\frac{P(w_{1},w_{2})}{P(w_{1})P(w_{2})})$

$PPMI(w_{1},w_{2}) = max(log(\frac{P(w_{1},w_{2})}{P(w_{1})P(w_{2})}),0)$

In [None]:
#########
# transform the space using positive(?) pointwise mutual information

# target t, dimension value c, then
# PMI(t, c) = log ( P(t, c) / (P(t) P(c)) )
# where
# P(t, c) = #(t, c) / #(_, _)
# P(t) = #(t, _) / #(_, _)
# P(c) = #(_, c) / #(_, _)
#
# PPMI(t, c) =   PMI(t, c) if PMI(t, c) > 0
#                0 else
def ppmi_transform(space, word_index):
    """Re-weight a raw count space with positive pointwise mutual information.

    PPMI(t, c) = max(log(P(t, c) / (P(t) * P(c))), 0), where each
    probability is the matching count divided by the grand total #(_, _).

    space: dict mapping each target word to a numpy vector of raw counts.
    word_index: dict mapping each word to its dimension in those vectors.

    Returns a new dict mapping each target word to a float vector of PPMI
    values; the vectors in `space` are left untouched. If the space holds
    no counts at all, a message is printed and `space` itself is returned.
    """
    # #(t, _): total count of every target word
    target_totals = {target: counts.sum() for target, counts in space.items()}

    # #(_, c): total count of every context dimension listed in word_index
    context_totals = {
        dim: sum([counts[dim] for counts in space.values()])
        for dim in word_index.values()
    }

    # sanity check: targets and contexts are the same words, so a word's
    # row total should equal its column total
    for target, total in target_totals.items():
        dim = word_index[target]
        if total != context_totals[dim]:
            print("whoops, failed sanity check for", target, total, context_totals[dim])

    # #(_, _): overall count of occurrences
    grand_total = sum(target_totals.values())
    if grand_total == 0:
        # nothing to normalize by
        print("completely empty space, returning it unchanged")
        return space

    # P(t, c) / P(t) for every target; the division creates new float arrays
    ratios_by_target = { }
    for target, counts in space.items():
        ratios = counts / grand_total
        p_target = target_totals[target] / grand_total
        # an unseen target has an all-zero row, which is left as it is
        if p_target != 0:
            ratios = ratios / p_target
        ratios_by_target[target] = ratios

    # ... / P(c), one column at a time
    for dim, total in context_totals.items():
        p_context = total / grand_total
        if p_context == 0:
            # unseen context item: the whole column is already 0.0
            continue
        for ratios in ratios_by_target.values():
            ratios[dim] = ratios[dim] / p_context

    # take the logarithm; zero entries become -inf without a warning
    with np.errstate(divide="ignore", invalid="ignore"):
        pmi_by_target = {
            target: np.log(ratios) for target, ratios in ratios_by_target.items()
        }

    # turn negative numbers (including -inf) to zero
    return {target: np.maximum(pmi, 0.0) for target, pmi in pmi_by_target.items()}

###
# run this:
def test_ppmispace():
    """Build the raw and the PPMI space for the 10 most frequent words and print both.

    Returns (space, ppmispace): the raw count vectors and their
    PPMI-transformed counterparts.
    """
    numdims = 10
    # which words to use as targets and context words?
    ktw = do_word_count(demo_dir, numdims)
    # mapping words to an index, which will be their column
    # in the table of counts
    wi = make_word_index(ktw)
    ranked = sorted(wi.items(), key=lambda entry: entry[1])
    words_in_order = [word for word, _ in ranked]

    print("word index:")
    for word, position in ranked:
        print(word, position, end=" ")
    print("\n")

    space = make_space(demo_dir, wi, numdims)
    ppmispace = ppmi_transform(space, wi)

    print("some raw counts vectors and some ppmi vectors")
    for shown in words_in_order[:10]:
        print("---------", "\n", shown)
        print("raw", space[shown])
        # for the PPMI space, we're rounding to 2 digits after the floating point
        print("ppmi", np.round(ppmispace[shown], 2), "\n")

    return space, ppmispace

In [None]:
# Build and print the raw and PPMI spaces; the returned tuple is not assigned here.
test_ppmispace()

In [None]:
# Recompute both spaces. `words_in_order` is not reassigned in this cell,
# so it must still hold the value from the earlier raw-count cell.
space, ppmispace = test_ppmispace()

# `pd` is likewise expected to be in scope from the earlier cell's import.
# One row per key of `ppmispace`; columns labelled with the context words.
PMI_matrix = pd.DataFrame.from_dict(ppmispace, orient='index')
PMI_matrix.columns = words_in_order

print(PMI_matrix)
#print(words_in_order)