In [1]:
import nltk
from nltk.book import *
import numpy as np
import requests
from nltk.corpus import gutenberg
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import treebank

nltk.download('gutenberg')
nltk.download('punkt')
nltk.download('treebank')

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


[nltk_data] Downloading package gutenberg to /Users/pasha/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to /Users/pasha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package treebank to /Users/pasha/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


True

In [2]:
def get_voc_from_text(text_=gutenberg.raw('melville-moby_dick.txt')+gutenberg.raw('shakespeare-macbeth.txt')):
    return [element for element in set([word.lower() for word in word_tokenize(text=text_) if word.isalpha()])]


def tokenize_text_to_paragraph(text_=gutenberg.raw('melville-moby_dick.txt'),*, split_="\r\n\r\n"):
    return text_.split(split_)


def get_tokens_from_text(text_=gutenberg.raw('melville-moby_dick.txt')):
    return [word.lower() for word in word_tokenize(text=text_) if word.isalpha()]

In [6]:
vocabulary = get_voc_from_text()
vocabulary[:10]

['measures',
 'cartload',
 'jaws',
 'jacob',
 'aforethought',
 'infirmity',
 'stammering',
 'brighggians',
 'realize',
 'luxuriant']

In [None]:
moby_dick_paragraphs = tokenize_text_to_paragraph()
moby_dick_paragraphs[4]

In [86]:
def PPMI(freq_w1_w2, freq_w1, freq_w2, total):
    return max(np.log2(freq_w1_w2*total/freq_w1*freq_w2), 0)

In [95]:
def PPMI_matrix(term_term_matrix_):
    matrix_elmnt = np.sum(term_term_matrix_)
    mtrx_len = term_term_matrix_.shape[0]
    
    for idx_row in range(term_term_matrix_.shape[0]):
        for idx_column in range(term_term_matrix_.shape[-1]):
            term_term_matrix_[idx_row][idx_column] = PPMI(
                term_term_matrix_[idx_row][idx_column],
                np.sum(term_term_matrix_[idx_row]),
                sum([term_term_matrix_[idx][idx_column] for idx in range(term_term_matrix_.shape[0]) ]),
                matrix_elmnt
            )
    
    
    return term_term_matrix_

## Implement PPMI weighting with co-occurrence based on the presence within the same paragraph.

In [90]:
def term_to_term_matrix_paragraphs(voc=vocabulary, text=gutenberg.raw('melville-moby_dick.txt')):
    text_paragraphs = tokenize_text_to_paragraph(text)
    text_voc = get_voc_from_text(text)
    
    text_voc_idx = dict([(elmnt, idx) for idx, elmnt in enumerate(text_voc)])
    voc_idx = dict([(elmnt, idx) for idx, elmnt in enumerate(voc)])
    
    
    matrix_term_to_term = np.zeros((len(voc), len(text_voc)))
    
    for paragraph in text_paragraphs:
        tokens = get_tokens_from_text(paragraph)
        
        for idx, token in enumerate(tokens):
            for token_next in tokens[-idx:]:
                try:
                    matrix_term_to_term[voc_idx[token]][text_voc_idx[token_next]] += 1
                except:
                    pass
    
    
    return matrix_term_to_term

In [91]:
term_term_matrix_paragraphsi = term_to_term_matrix_paragraphs()
term_term_matrix_paragraphs

array([[ 1.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  1.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0., 19., ...,  0.,  1.,  0.],
       ...,
       [ 0.,  0.,  0., ...,  1.,  0.,  0.],
       [ 0.,  0.,  1., ...,  0., 22.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  1.]])

In [92]:
np.sum(term_term_matrix_paragraphs)

17942941.0

In [None]:
ppmi_matrix_paragraphs = PPMI_matrix(term_term_matrix_paragraphs)
ppmi_matrix_paragraphs

In [None]:
np.sum(ppmi_matrix)

## Implement PPMI weighting with co-occurrence based on a sliding window of neighboring words. Pick some number between 2-10.

In [58]:
def term_to_term_matrix(voc=vocabulary, text=gutenberg.raw('melville-moby_dick.txt'), *, co_occurrence=2):
    text_voc = get_voc_from_text(text)
    

    
    
    text_voc_idx = dict([(elmnt, idx) for idx, elmnt in enumerate(text_voc)])
    voc_idx = dict([(elmnt, idx) for idx, elmnt in enumerate(voc)])
    
    
    matrix_term_to_term = np.zeros((len(voc), len(text_voc)))
    
    tokens = get_tokens_from_text(text)
    for idx, token in enumerate(tokens):
        for token_next in tokens[idx:idx+co_occurrence+1]:
            try:
                matrix_term_to_term[voc_idx[token]][text_voc_idx[token_next]] += 1
            except:
                pass
    
    return matrix_term_to_term

In [61]:
term_term_matrix = term_to_term_matrix()
term_term_matrix

array([[ 3.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  1.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0., 29., ...,  0.,  0.,  0.],
       ...,
       [ 0.,  0.,  0., ...,  1.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0., 42.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  2.]])

In [62]:
np.sum(term_term_matrix)

630960.0

In [77]:
ppmi_matrix = PPMI(term_term_matrix)
ppmi_matrix

  term_term_matrix_[idx_row][idx_column] = max(np.log2(term_term_matrix_[idx_row][idx_column]/matrix_elmnt),0)


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [78]:
np.sum(ppmi_matrix)

0.0

## Check how algorithm works using English thesaurus. Pick some 10 words, find synonyms for these, e.g. using https://www.merriam-webster.com/thesaurus. Note that semantic similarity is represented in different shades of orange. Does it match the output of PPMI weighting function? Would be nice if you could also draw a table with shaded cells matching closeness given by PPMI.

In [100]:
voc_text_moby_dick = dict([(elmnt, idx) for idx, elmnt in enumerate( get_voc_from_text())])
voc_text_moby_dick

{'measures': 0,
 'cartload': 1,
 'jaws': 2,
 'jacob': 3,
 'aforethought': 4,
 'infirmity': 5,
 'stammering': 6,
 'brighggians': 7,
 'realize': 8,
 'luxuriant': 9,
 'weale': 10,
 'expeditions': 11,
 'spels': 12,
 'parade': 13,
 'tugging': 14,
 'billows': 15,
 'migratory': 16,
 'fatness': 17,
 'enormous': 18,
 'bookbinder': 19,
 'formation': 20,
 'conuinces': 21,
 'genealogies': 22,
 'battle': 23,
 'unconcluded': 24,
 'groove': 25,
 'attended': 26,
 'schooner': 27,
 'leathern': 28,
 'length': 29,
 'which': 30,
 'sounds': 31,
 'lashings': 32,
 'dented': 33,
 'tertius': 34,
 'pig': 35,
 'leaps': 36,
 'shell': 37,
 'nap': 38,
 'solely': 39,
 'growlands': 40,
 'doubloon': 41,
 'queequeg': 42,
 'clearer': 43,
 'second': 44,
 'fang': 45,
 'porter': 46,
 'nasty': 47,
 'neerely': 48,
 'abed': 49,
 'composed': 50,
 'dissertations': 51,
 'whom': 52,
 'hats': 53,
 'verily': 54,
 'nets': 55,
 'confidential': 56,
 'distinguish': 57,
 'dutch': 58,
 'incurable': 59,
 'mode': 60,
 'for': 61,
 'ague': 62

In [120]:
print(f"beautiful - {voc_text_moby_dick['beautiful']}")
print(f"lovely - {voc_text_moby_dick['lovely']}, handsome - {voc_text_moby_dick['handsome']}")
print(f"perfect - {voc_text_moby_dick['perfect']}, goodly - {voc_text_moby_dick['goodly']}, superb - {voc_text_moby_dick['superb']}")
print(f"desirable - {voc_text_moby_dick['desirable']}, sublime - {voc_text_moby_dick['sublime']}, striking - {voc_text_moby_dick['striking']}")

beautiful - 11027
lovely - 9429, handsome - 4591
perfect - 15700, goodly - 7644, superb - 12722
desirable - 14871, sublime - 4987, striking - 15289


In [131]:
term_term_matrix_paragraphs[11027]

array([0., 0., 0., ..., 0., 0., 0.])

In [169]:
def get_semantic_similarity(term_term_matrix_paragraphs=term_term_matrix_paragraphs, idx_1=11027, idx_2=9429):
    arr = np.array([term_term_matrix_paragraphs[idx][idx_2] for idx in range(term_term_matrix_paragraphs.shape[0])])
    new_arr = arr[~np.isnan(arr)]
    
    return PPMI(
    freq_w1_w2=term_term_matrix_paragraphs[idx_1][idx_2], 
    freq_w1=np.sum(term_term_matrix_paragraphs[idx_1]), 
    freq_w2=np.sum(new_arr),
    total=np.sum(term_term_matrix_paragraphs[~np.isnan(term_term_matrix_paragraphs)]))


beautiful and lovely - nan


In [174]:
print(f'beautiful and lovely - {get_semantic_similarity()}')
print(f'beautiful and handsome - {get_semantic_similarity(idx_2=4591)}')
print(f'beautiful and perfect - {get_semantic_similarity(idx_2=15700)}')
print(f'beautiful and goodly - {get_semantic_similarity(idx_2=7644)}')
print(f'beautiful and superb - {get_semantic_similarity(idx_2=12722)}')
print(f'beautiful and desirable - {get_semantic_similarity(idx_2=14871)}')
print(f'beautiful and sublime - {get_semantic_similarity(idx_2=4987)}')
print(f'beautiful and striking - {get_semantic_similarity(idx_2=15289)}')

  return max(np.log2(freq_w1_w2*total/freq_w1*freq_w2), 0)


beautiful and lovely - 0
beautiful and handsome - 25.035092070786188
beautiful and perfect - 0
beautiful and goodly - 0
beautiful and superb - 0
beautiful and desirable - 0
beautiful and sublime - 0
beautiful and striking - 0
