## Imports

In [4]:
from collections import Counter, defaultdict # You may import more from collections if needed
import matplotlib.pyplot as plt
from nltk.corpus import brown
import numpy as np
import os

## Load corpora

In [5]:
# Join every token returned by brown.words() into one space-separated string.
corpus = ' '.join(brown.words())
# Read the smaller corpus verbatim from a text file; the path is relative to
# the current working directory.
with open('./brown_100.txt', 'r') as file_:
    mini_corpus = file_.read()

## Define PMI/PPMI Calculator

In [21]:
class PMICalculator:
    """Score adjacent word pairs of a corpus with PMI or PPMI.

    The corpus is tokenised by whitespace and every token is lowercased
    before it is counted or paired. For a pair ``(w1, w2)`` the score is::

        PMI(w1, w2) = log2( count(w1, w2) * N / (count(w1) * count(w2)) )

    where ``N`` is the number of tokens in the corpus. With ``ppmi=True``
    negative scores are clipped to ``0.0``.

    Args:
        corpus: The raw text; tokens are whatever ``str.split()`` yields.
        ppmi: If True, ``calculate_pmi`` stores positive PMI (negatives -> 0.0).
        min_count: A pair is only counted when both of its words occur at
            least this many times in the corpus (case-insensitively).

    Attributes:
        words: The tokens in original casing.
        word_counts: Case-insensitive token frequencies (keys are lowercase).
        word_pairs: Pair -> number of occurrences, filled by
            ``calculate_word_pairs``.
        total_words: Number of tokens in the corpus.
        pmis: Pair -> score, filled by ``calculate_pmi``.
    """

    # Token that closes a sentence in corpora that carry sentence markers.
    _END_TOKEN = "</s>"

    def __init__(self, corpus: str, ppmi: bool = False, min_count: int = 10):
        self.corpus = corpus
        self.words = corpus.split()
        # Pairs are built from lowercased tokens, so the unigram counts must be
        # lowercased too; otherwise "The" and "the" are counted separately and
        # both the frequency filter and the PMI denominator are wrong.
        self.word_counts = Counter(word.lower() for word in self.words)
        self.word_pairs = defaultdict(int)
        self.total_words = len(self.words)
        self.ppmi = ppmi
        self.min_count = min_count
        self.pmis = {}

    def calculate_word_pairs(self) -> None:
        """Count adjacent word pairs and store them in ``self.word_pairs``.

        Rules applied while scanning left to right:

        * A pair is ignored unless both words occur at least ``min_count``
          times.
        * A pair ending in the end token ``</s>`` is counted, and the scan
          then jumps past the end token so no pair spans two sentences.
        * A pair ending in ``"."`` is only counted when that period is
          directly followed by ``</s>``. Otherwise the pair is dropped and
          the scan jumps past the period, so the period never starts a pair
          either. A period that is the very last token is treated the same
          way instead of reading past the end of the corpus.

        Counts are rebuilt from scratch on every call, so calling this method
        repeatedly does not inflate them.
        """
        tokens = [word.lower() for word in self.words]
        counts = self.word_counts
        threshold = self.min_count
        last_index = len(tokens) - 1

        self.word_pairs.clear()

        i = 0
        while i < last_index:
            word1, word2 = tokens[i], tokens[i + 1]

            # Only consider pairs whose words are both frequent enough.
            if counts[word1] < threshold or counts[word2] < threshold:
                i += 1
                continue

            if word2 == ".":
                # Look one token ahead without running off the end.
                following = tokens[i + 2] if i + 2 <= last_index else None
                if following != self._END_TOKEN:
                    # Period not followed by the end token: drop this pair and
                    # skip the period entirely.
                    i += 2
                    continue
            elif word2 == self._END_TOKEN:
                # Count the sentence-final pair, then resume after the end
                # token.
                self.word_pairs[(word1, word2)] += 1
                i += 2
                continue

            self.word_pairs[(word1, word2)] += 1
            i += 1

    def calculate_pmi(self) -> dict:
        """Compute the PMI (or PPMI) of every counted pair.

        Returns:
            ``self.pmis``: a dict mapping each ``(word1, word2)`` pair to its
            score as a Python float. The dict is rebuilt on every call.
        """
        scores = {}
        for (word1, word2), pair_count in self.word_pairs.items():
            # Reading a missing key from the defaultdict creates a zero entry;
            # such pairs never occurred and have no defined PMI.
            if pair_count <= 0:
                continue

            denominator = self.word_counts[word1] * self.word_counts[word2]
            pmi = float(np.log2(pair_count * self.total_words / denominator))

            # PPMI clips negative associations to zero.
            if self.ppmi:
                pmi = max(0.0, pmi)
            scores[(word1, word2)] = pmi

        self.pmis = scores
        return self.pmis

    def sorted_pmi_pairs(self, top_n=10, reverse=True) -> list:
        """Return the first ``top_n`` ``(pair, score)`` items ordered by score.

        Args:
            top_n: Maximum number of items to return.
            reverse: True for highest scores first, False for lowest first.

        Returns:
            A list of ``((word1, word2), score)`` tuples. Ties keep the order
            in which the pairs were stored.
        """
        ranked = sorted(self.pmis.items(), key=lambda item: item[1], reverse=reverse)
        return ranked[:top_n]

## Use PMI Calculator

### Calculate PMIs for full corpus

In [None]:
# Plain PMI over the full corpus string (ppmi is left at its default, False):
# count the adjacent word pairs first, then score every counted pair.
pmi_calculator = PMICalculator(corpus)
pmi_calculator.calculate_word_pairs()
pmi_calculator.calculate_pmi()

#### Top 20 PMIs + Notes

In [None]:
# reverse=True sorts descending, so these are the 20 highest-PMI pairs.
top_pmi_pairs = pmi_calculator.sorted_pmi_pairs(20, reverse=True)
for pair, pmi in top_pmi_pairs:
    print("Pair: {} PMI: {}".format(pair, pmi))

##### Notes:
- Pairs with highest PMI are mostly proper nouns or uncommon adjective/word pairs.
- They are words that don't often appear by themselves
- Some pairs, such as 'fake therapeutic' and 'antenna beam', have the same PMI, meaning that their joint count relative to the product of their individual word counts is the same

#### Bottom 20 PMIs + Notes

In [None]:
# reverse=False sorts ascending, so these are the 20 lowest-PMI pairs.
bottom_pmi_pairs = pmi_calculator.sorted_pmi_pairs(20, reverse=False)
for pair, pmi in bottom_pmi_pairs:
    print("Pair: {} PMI: {}".format(pair, pmi))

##### Observations:
- PMIs for the bottom pairs are all negative
- they include words that are very common but do not go often together
- 'of on' and 'on of' are the two orderings of the same word pair, and they have the same PMI

### Calculate PMIs for mini corpus

In [None]:
# Plain PMI over the mini corpus (ppmi is left at its default, False):
# count the adjacent word pairs first, then score every counted pair.
pmi_mini_corpus_calculator = PMICalculator(mini_corpus)
pmi_mini_corpus_calculator.calculate_word_pairs()
pmi_mini_corpus_calculator.calculate_pmi()

#### Top 20 PMI pairs for mini corpus

In [None]:
# reverse=True sorts descending, so these are the 20 highest-PMI pairs of the
# mini corpus. Note that this rebinds top_pmi_pairs.
top_pmi_pairs = pmi_mini_corpus_calculator.sorted_pmi_pairs(20, reverse=True)
for pair, pmi in top_pmi_pairs:
    print("Pair: {} PMI: {}".format(pair, pmi))

##### Observations:
- contrary to the full corpus, the top PMI pairs for the mini corpus don't contain any proper nouns.
- the top pairs seem to simply be pairs that frequently occur together; they share no other discernible characteristic

#### Bottom 20 PMI pairs for mini corpus

In [None]:
# reverse=False sorts ascending, so these are the 20 lowest-PMI pairs of the
# mini corpus. Note that this rebinds bottom_pmi_pairs.
bottom_pmi_pairs = pmi_mini_corpus_calculator.sorted_pmi_pairs(20, reverse=False)
for pair, pmi in bottom_pmi_pairs:
    print("Pair: {} PMI: {}".format(pair, pmi))

##### Observations:
- while still uncommon pairs, the bottom PMIs for the mini corpus are not as strange as those for the full corpus

### Calculate PPMIs for full corpus

In [None]:
# Same pipeline over the full corpus, but with ppmi=True so that negative PMI
# values are replaced by 0 when the scores are computed.
ppmi_full_corpus_calculator = PMICalculator(corpus, ppmi=True)
ppmi_full_corpus_calculator.calculate_word_pairs()
ppmi_full_corpus_calculator.calculate_pmi()

#### Top PPMI pairs for full corpus

In [None]:
# reverse=True sorts descending, so these are the 20 highest-PPMI pairs.
# The printed label still reads "PMI" although the values are PPMI scores.
top_ppmi_pairs = ppmi_full_corpus_calculator.sorted_pmi_pairs(20, reverse=True)
for pair, pmi in top_ppmi_pairs:
    print("Pair: {} PMI: {}".format(pair, pmi))

##### Observations:
- the top PPMI pairs look exactly the same as the PMI pairs

In [None]:
# reverse=False sorts ascending, so these are the 20 lowest-PPMI pairs.
# The printed label still reads "PMI" although the values are PPMI scores.
bottom_ppmi_pairs = ppmi_full_corpus_calculator.sorted_pmi_pairs(20, reverse=False)
for pair, pmi in bottom_ppmi_pairs:
    print("Pair: {} PMI: {}".format(pair, pmi))

##### Observations:
- the scores of the bottom pairs are indistinguishable from each other, they are all 0

### Calculate PPMIs for mini corpus

In [None]:
# Same pipeline over the mini corpus, but with ppmi=True so that negative PMI
# values are replaced by 0 when the scores are computed.
ppmi_mini_corpus_calculator = PMICalculator(mini_corpus, ppmi=True)
ppmi_mini_corpus_calculator.calculate_word_pairs()
ppmi_mini_corpus_calculator.calculate_pmi()

In [None]:
# reverse=True sorts descending, so these are the 20 highest-PPMI pairs of the
# mini corpus. Note that this rebinds top_ppmi_pairs.
top_ppmi_pairs = ppmi_mini_corpus_calculator.sorted_pmi_pairs(20, reverse=True)
for pair, pmi in top_ppmi_pairs:
    print("Pair: {} PMI: {}".format(pair, pmi))

##### Observations
- the top PPMI pairs are different from those of the full corpus, and their scores are smaller
- none of the top pairs are proper nouns as seen in the full corpus

In [None]:
# reverse=False sorts ascending, so these are the 20 lowest-PPMI pairs of the
# mini corpus. Note that this rebinds bottom_ppmi_pairs.
bottom_ppmi_pairs = ppmi_mini_corpus_calculator.sorted_pmi_pairs(20, reverse=False)
for pair, pmi in bottom_ppmi_pairs:
    print("Pair: {} PMI: {}".format(pair, pmi))

##### Observations
- Not all of the bottom pairs have PPMI values of 0, as seen in the full corpus
- The bottom pairs did not change much.
- While not super common, none of the bottom pairs in the mini corpus are super uncommon either