In [1]:
import nltk
import re
from collections import Counter
from itertools import islice
import pickle

## test corpus

In [25]:
with open('../data/unigram_counter_test_str.pickle', 'rb') as unigram_file, \
        open('../data/bigram_counter_test_str.pickle', 'rb') as bigram_file, \
        open('../data/trigram_counter_test_str.pickle', 'rb') as trigram_file,\
        open('../data/tagged_unigrams_counter_test_str.pickle', 'rb') as tagged_unigram_file, \
        open('../data/tagged_bigrams_counter_test_str.pickle', 'rb') as tagged_bigram_file, \
        open('../data/tagged_trigrams_counter_test_str.pickle', 'rb') as tagged_trigram_file:
    unigram_counter = pickle.load(unigram_file)
    bigram_counter = pickle.load(bigram_file)
    trigram_counter = pickle.load(trigram_file)    
    tagged_unigram_counter = pickle.load(tagged_unigram_file)
    tagged_bigram_counter = pickle.load(tagged_bigram_file)
    tagged_trigram_counter = pickle.load(tagged_trigram_file)    

# COLLOCATIONS

In [26]:
# JJ - adjective or numeral, JJR  - adjective, JJS - adjective, superlative
adjs = ['JJ','JJR','JJS']
# NN noun, common; NNP - noun, proper, sing.; NNPS - noun, proper, plural; NNS - noun, common, plural
nouns = ['NN','NNS','NNP','NNPS']
for noun in nouns:
    tag_combs += [(adj, noun) for adj in adjs] # A,N
    for adj in adjs:
        for noun_ in nouns:
            tag_combs.append((adj, noun, noun_)) # A, N, N
            tag_combs.append((noun, adj, noun_)) # N, A, N
        tag_combs += [(adj, adj_, noun) for adj_ in adjs] # A, A, N
    for noun_ in nouns:
        tag_combs += [(noun, noun_, noun__) for noun__ in nouns] # N, N, N
        tag_combs += [(noun, 'IN', noun_)] # N, P, N
    tag_combs += [(noun_, noun) for noun_ in nouns] # N, N

In [None]:
def get_most_common(test_func, tagged_ngram_counter, m=20):
    most_common = sorted(test_func, key=test_func.get, reverse=True)[:20]
    print("               ngram              |  count  |   test   ")
    for tagged_ngram in most_common:
        ngram, _ = tagged_ngram
        print(f"{repr(ngram):^34}|{repr(tagged_ngram_counter[tagged_ngram]):^9}|{student_t[tagged_ngram]:^8.1f}")

## T-test

<div style="font-size: 20px">
$$t = \frac{\bar{X} - \mu}{\sqrt{\frac{s^2}{N}}}$$
<br><div style="font-size:15px">
$\mu=p(w1,w2)=p(w1)*p(w2)=\frac{count(w1)}{n_{bigrams}}\frac{count(w2)}{n_{bigrams}}$
<br>$\bar{X}=p(w1,w2)=\frac{count(w1,w2)}{n_{bigrams}}$
<br>$s^2=p(w1,w2)*\left(1-p(w1,w2)\right)\approx p(w1,w2),$
<br>because probability of each bigram is very small $\Rightarrow (1-p) \approx 1$
    
    
$$t = \frac{\frac{count(w1,w2)}{n_{bigrams}} - \frac{count(w1)*count(w2)}{n_{bigrams}^2}}{\sqrt{\frac{count(w1,w2)}{n_{bigrams}^2}}} = \sqrt{count(w1,w2)} - \frac{count(w1)*count(w2)}{n*\sqrt{count(w1,w2)}}$$

In [21]:
import math

In [27]:
filtered_bigram_counter = Counter()
for tagged_bigram, count in tagged_bigram_counter.items():
    _, tags = tagged_bigram
    if tags in tag_combs:
        filtered_bigram_counter[tagged_bigram] = count

In [28]:
student_t = dict()
n_bigrams = sum(bigram_counter.values())
for tagged_bigram, count in filtered_bigram_counter.items():
    bigram, _ = tagged_bigram
    student_t[tagged_bigram] = math.sqrt(count) 
    student_t[tagged_bigram] -= unigram_counter[bigram[0]] * unigram_counter[bigram[1]] / n_bigrams / math.sqrt(count)

In [None]:
most_common = sorted(student_t, key=student_t.get, reverse=True)[:20]
print("              bigram              |  count  |  t-test  ")
for tagged_bigram in most_common:
    bigram, _ = tagged_bigram
    print(f"{repr(bigram):^34}|{repr(tagged_bigram_counter[tagged_bigram]):^9}|{student_t[tagged_bigram]:^8.1f}")

In [30]:
filtered_trigram_counter = Counter()
for tagged_trigram, count in tagged_trigram_counter.items():
    _, tags = tagged_trigram
    if tags in tag_combs:
        filtered_trigram_counter[tagged_trigram] = count
        
student_t = dict()
n_trigrams = sum(filtered_trigram_counter.values())
for tagged_trigram, count in filtered_trigram_counter.items():
    trigram, _ = tagged_trigram
    student_t[tagged_trigram] = math.sqrt(count) 
    student_t[tagged_trigram] -= unigram_counter[trigram[0]] * unigram_counter[trigram[1]] * unigram_counter[trigram[2]]
    student_t[tagged_trigram] /= (n_trigrams * math.sqrt(count))
    
most_common = sorted(student_t, key=student_t.get, reverse=True)[:20]
print("                        trigram                        |  count  |  t-test  ")
for tagged_trigram in most_common:
    trigram, _ = tagged_trigram
    print(f"{repr(trigram):^54}|{repr(tagged_trigram_counter[tagged_trigram]):^9}|{student_t[tagged_trigram]:^8.1f}")

                        trigram                        |  count  |  t-test  
                   '/ prnewswire /'                   |   11    |  0.0   
                   'da vinci code'                    |   11    |  0.0   
                     '/ top news'                     |    6    |  0.0   
                  '/ mayo institute'                  |    1    |  0.0   
                'un security council'                 |   26    |  0.0   
                   'i doubt things'                   |    1    |  0.0   
              'ip video communications'               |    1    |  0.0   
               'un ambassador h.m.g.s'                |    1    |  0.0   
                     'tv in bed'                      |    1    |  0.0   
                   'h. holder jr.'                    |    5    |  0.0   
                     '% if pres'                      |    1    |  0.0   
                  'ba from heathrow'                  |    1    |  0.0   
               '/ / www.doveawards.

my $\chi^2$ is too slow 

In [45]:
chi_sq = dict()
i, k = 0, len(filtered_bigram_counter)
for tagged_bigram, o11 in filtered_bigram_counter.items():
    print(f'working on {i}/{k} bigram',end='\r')
    i+=1
    bigram, _ = tagged_bigram
    w1, w2 = bigram.split()
    o21, o12, o22 = 0, 0, 0
    for bigram1, count in bigram_counter.items():
        if bigram1.endswith(w2):
            o12 += count
        elif bigram1.startswith(w1):
            o21 += count
        else:
            o22 += count
    chi_sq[tagged_bigram] = n_bigrams * (o11 * o22 - o12 * o21)**2
    chi_sq[tagged_bigram] /= ((o11 + o12)*(o11 + o21)*(o12 + o22)*(o21 + o22))

working on 25/442068 bigram

KeyboardInterrupt: 

In [None]:
most_common = sorted(chi_sq, key=chi_sq.get, reverse=True)[:20]
print("              bigram              |  count  |  chi-sq  ")
for tagged_bigram in most_common:
    bigram, _ = tagged_bigram
    print(f"{repr(bigram):^34}|{repr(tagged_bigram_counter[tagged_bigram]):^9}|{chi_sq[tagged_bigram]:^8.1f}")