In [1]:
import nltk
import re
from collections import Counter
from itertools import islice
import pickle
import math

## test corpus

In [2]:
with open('../data/unigram_counter_test_str.pickle', 'rb') as unigram_file, \
        open('../data/bigram_counter_test_str.pickle', 'rb') as bigram_file, \
        open('../data/trigram_counter_test_str.pickle', 'rb') as trigram_file,\
        open('../data/tagged_unigrams_counter_test_str.pickle', 'rb') as tagged_unigram_file, \
        open('../data/tagged_bigrams_counter_test_str.pickle', 'rb') as tagged_bigram_file, \
        open('../data/tagged_trigrams_counter_test_str.pickle', 'rb') as tagged_trigram_file:
    unigram_counter = pickle.load(unigram_file)
    bigram_counter = pickle.load(bigram_file)
    trigram_counter = pickle.load(trigram_file)    
    tagged_unigram_counter = pickle.load(tagged_unigram_file)
    tagged_bigram_counter = pickle.load(tagged_bigram_file)
    tagged_trigram_counter = pickle.load(tagged_trigram_file)    

# COLLOCATIONS

In [4]:
# JJ - adjective or numeral, JJR  - adjective, JJS - adjective, superlative
adjs = ['JJ','JJR','JJS']
# NN noun, common; NNP - noun, proper, sing.; NNPS - noun, proper, plural; NNS - noun, common, plural
nouns = ['NN','NNS','NNP','NNPS']
tag_combs = []
for noun in nouns:
    tag_combs += [(adj, noun) for adj in adjs] # A,N
    for adj in adjs:
        for noun_ in nouns:
            tag_combs.append((adj, noun, noun_)) # A, N, N
            tag_combs.append((noun, adj, noun_)) # N, A, N
        tag_combs += [(adj, adj_, noun) for adj_ in adjs] # A, A, N
    for noun_ in nouns:
        tag_combs += [(noun, noun_, noun__) for noun__ in nouns] # N, N, N
        tag_combs += [(noun, 'IN', noun_)] # N, P, N
    tag_combs += [(noun_, noun) for noun_ in nouns] # N, N

In [15]:
def get_most_common(test_func, tagged_ngram_counter, m=20):
    most_common = sorted(test_func, key=test_func.get, reverse=True)[:20]
    print("               ngram              |  count  |   test   ")
    for tagged_ngram in most_common:
        ngram, _ = tagged_ngram
        print(f"{repr(ngram):^34}|{repr(tagged_ngram_counter[tagged_ngram]):^9}|{test_func[tagged_ngram]:^8.1f}")

## T-test

<div style="font-size: 20px">
$$t = \frac{\bar{X} - \mu}{\sqrt{\frac{s^2}{N}}}$$
<br><div style="font-size:15px">
$\mu=p(w1,w2)=p(w1)*p(w2)=\frac{count(w1)}{n_{bigrams}}\frac{count(w2)}{n_{bigrams}}$
<br>$\bar{X}=p(w1,w2)=\frac{count(w1,w2)}{n_{bigrams}}$
<br>$s^2=p(w1,w2)*\left(1-p(w1,w2)\right)\approx p(w1,w2),$
<br>because probability of each bigram is very small $\Rightarrow (1-p) \approx 1$
    
    
$$t = \frac{\frac{count(w1,w2)}{n_{bigrams}} - \frac{count(w1)*count(w2)}{n_{bigrams}^2}}{\sqrt{\frac{count(w1,w2)}{n_{bigrams}^2}}} = \sqrt{count(w1,w2)} - \frac{count(w1)*count(w2)}{n*\sqrt{count(w1,w2)}}$$

## BIGRAMS

In [16]:
filtered_bigram_counter = Counter()
for tagged_bigram, count in tagged_bigram_counter.items():
    _, tags = tagged_bigram
    if tags in tag_combs:
        filtered_bigram_counter[tagged_bigram] = count
        
student_t2 = dict()
n_bigrams = sum(bigram_counter.values())
for tagged_bigram, count in filtered_bigram_counter.items():
    bigram, _ = tagged_bigram
    student_t2[tagged_bigram] = math.sqrt(count) 
    student_t2[tagged_bigram] -= unigram_counter[bigram[0]] * unigram_counter[bigram[1]] / n_bigrams / math.sqrt(count)

get_most_common(student_t2, tagged_bigram_counter)

               ngram              |  count  |   test   
            'new york'            |  3290   |  57.4  
           'last year'            |  2540   |  50.3  
         'united states'          |  2429   |  49.3  
           'last week'            |  1610   |  40.0  
         'prime minister'         |  1322   |  36.4  
           'last month'           |  1054   |  32.4  
          'white house'           |  1029   |  32.1  
           'first time'           |   958   |  30.9  
          'barack obama'          |   801   |  28.1  
        'chief executive'         |   777   |  27.9  
          'health care'           |   774   |  27.8  
           'next year'            |   619   |  24.9  
          'human rights'          |   500   |  22.4  
        'associated press'        |   526   |  22.3  
          'los angeles'           |   486   |  22.0  
         'vice president'         |   464   |  21.5  
         'climate change'         |   409   |  20.2  
          'wall street'   

## TRIGRAMS

In [18]:
filtered_trigram_counter = Counter()
for tagged_trigram, count in tagged_trigram_counter.items():
    _, tags = tagged_trigram
    if tags in tag_combs:
        filtered_trigram_counter[tagged_trigram] = count
        
student_t3 = dict()
n_trigrams = sum(filtered_trigram_counter.values())
for tagged_trigram, count in filtered_trigram_counter.items():
    trigram, _ = tagged_trigram
    student_t3[tagged_trigram] = math.sqrt(count) 
    student_t3[tagged_trigram] -= unigram_counter[trigram[0]] * unigram_counter[trigram[1]] * unigram_counter[trigram[2]]
    student_t3[tagged_trigram] /= (n_trigrams * math.sqrt(count))
    
get_most_common(student_t3, tagged_trigram_counter)

               ngram              |  count  |   test   
         '/ prnewswire /'         |   11    |  0.0   
         'da vinci code'          |   11    |  0.0   
           '/ top news'           |    6    |  0.0   
        '/ mayo institute'        |    1    |  0.0   
      'un security council'       |   26    |  0.0   
         'i doubt things'         |    1    |  0.0   
    'ip video communications'     |    1    |  0.0   
     'un ambassador h.m.g.s'      |    1    |  0.0   
           'tv in bed'            |    1    |  0.0   
         'h. holder jr.'          |    5    |  0.0   
           '% if pres'            |    1    |  0.0   
        'ba from heathrow'        |    1    |  0.0   
     '/ / www.doveawards.com'     |    1    |  0.0   
          'tv for fear'           |    1    |  0.0   
  '/ / www.northropgrumman.com'   |    1    |  0.0   
        'ip media player'         |    2    |  0.0   
    'uk government assistance'    |    1    |  0.0   
        '/ / www.basf.com'

my $\chi^2$ is too slow 

In [None]:
chi_sq = dict()
i, k = 0, len(filtered_bigram_counter)
for tagged_bigram, o11 in filtered_bigram_counter.items():
    print(f'working on {i}/{k} bigram',end='\r')
    i+=1
    bigram, _ = tagged_bigram
    w1, w2 = bigram.split()
    o21, o12, o22 = 0, 0, 0
    for bigram1, count in bigram_counter.items():
        if bigram1.endswith(w2):
            o12 += count
        elif bigram1.startswith(w1):
            o21 += count
        else:
            o22 += count
    chi_sq[tagged_bigram] = n_bigrams * (o11 * o22 - o12 * o21)**2
    chi_sq[tagged_bigram] /= ((o11 + o12)*(o11 + o21)*(o12 + o22)*(o21 + o22))

In [None]:
most_common = sorted(chi_sq, key=chi_sq.get, reverse=True)[:20]
print("              bigram              |  count  |  chi-sq  ")
for tagged_bigram in most_common:
    bigram, _ = tagged_bigram
    print(f"{repr(bigram):^34}|{repr(tagged_bigram_counter[tagged_bigram]):^9}|{chi_sq[tagged_bigram]:^8.1f}")