In [46]:
import nltk
import re
from collections import Counter
from itertools import islice
import pickle

## test corpus

In [117]:
with open('../data/unigram_counter_test.pickle', 'rb') as unigram_file, \
        open('../data/bigram_counter_test.pickle', 'rb') as bigram_file, \
        open('../data/trigram_counter_test.pickle', 'rb') as trigram_file,\
        open('../data/tagged_unigrams_counter_test.pickle', 'rb') as tagged_unigram_file, \
        open('../data/tagged_bigrams_counter_test.pickle', 'rb') as tagged_bigram_file, \
        open('../data/tagged_trigrams_counter_test.pickle', 'rb') as tagged_trigram_file:
    unigram_counter = pickle.load(unigram_file)
    bigram_counter = pickle.load(bigram_file)
    trigram_counter = pickle.load(trigram_file)    
    tagged_unigram_counter = pickle.load(tagged_unigram_file)
    tagged_bigram_counter = pickle.load(tagged_bigram_file)
    tagged_trigram_counter = pickle.load(tagged_trigram_file)    

# COLLOCATIONS

## T-test

<div style="font-size: 20px">
$$t = \frac{\bar{X} - \mu}{\sqrt{\frac{s^2}{N}}}$$
<br><div style="font-size:15px">
$\mu=p(w1,w2)=p(w1)*p(w2)=\frac{count(w1)}{n_{bigrams}}\frac{count(w2)}{n_{bigrams}}$
<br>$\bar{X}=p(w1,w2)=\frac{count(w1,w2)}{n_{bigrams}}$
<br>$s^2=p(w1,w2)*\left(1-p(w1,w2)\right)\approx p(w1,w2),$
<br>because probability of each bigram is very small $\Rightarrow (1-p) \approx 1$
    
    
$$t = \frac{\frac{count(w1,w2)}{n_{bigrams}} - \frac{count(w1)*count(w2)}{n_{bigrams}^2}}{\sqrt{\frac{count(w1,w2)}{n_{bigrams}^2}}} = \sqrt{count(w1,w2)} - \frac{count(w1)*count(w2)}{n*\sqrt{count(w1,w2)}}$$

In [4]:
import math

In [58]:
# JJ - adjective or numeral, JJR  - adjective, JJS - adjective, superlative
adjs = ['JJ','JJR','JJS']
# NN noun, common; NNP - noun, proper, sing.; NNPS - noun, proper, plural; NNS - noun, common, plural
nouns = ['NN','NNS','NNP','NNPS']
for noun in nouns:
    tag_combs = [(adj, noun) for adj in adjs] # A,N
    for adj in adjs:
        for noun_ in nouns:
            tag_combs.append((adj, noun, noun_)) # A, N, N
            tag_combs.append((noun, adj, noun_)) # N, A, N
        tag_combs += [(adj, adj_, noun) for adj_ in adjs] # A, A, N
    for noun_ in nouns:
        tag_combs += [(noun, noun_, noun__) for noun__ in nouns] # N, N, N
        tag_combs += [(noun, 'IN', noun_)] # N, P, N
    tag_combs += [(noun_, noun) for noun_ in nouns] # N, N

In [112]:
filtered_bigram_counter = Counter()
for tagged_bigram, count in tagged_bigram_counter.items():
    _, tags = zip(*tagged_bigram)
    if tags in tag_combs:
        filtered_bigram_counter[tagged_bigram] = count

In [113]:
student_t = dict()
n_bigrams = sum(filtered_bigram_counter.values())
for tagged_bigram, count in filtered_bigram_counter.items():
    bigram, _ = zip(*tagged_bigram)
    student_t[tagged_bigram] = math.sqrt(count) 
    student_t[tagged_bigram] -= unigram_counter[bigram[0]] * unigram_counter[bigram[1]] / n_bigrams / math.sqrt(count)

In [114]:
most_common = sorted(student_t, key=student_t.get, reverse=True)[:20]
print("              bigram              |  count  |  t-test  ")
for tagged_bigram in most_common:
    bigram, _ = zip(*tagged_bigram)
    print(f"{repr(bigram):^34}|{repr(tagged_bigram_counter[tagged_bigram]):^9}|{student_t[tagged_bigram]:^8.1f}")

              bigram              |  count  |  t-test  
     ('storied', 'leathers')      |    1    |  0.6   
    ('winless', 'norwegians')     |    1    |  0.1   
    ('eolas', 'technologies')     |    1    |  -0.4  
       ('milton', 'keynes')       |    1    |  -5.4  
      ('fervent', 'indians')      |    1    |  -6.5  
      ('mentor', 'graphics')      |    1    |  -9.8  
     ('sun', 'microsystems')      |    1    | -36.9  
      ('houston', 'texans')       |    1    | -42.4  
     ('glow', 'technologies')     |    1    | -47.9  
        ('fair', 'isaac')         |    1    | -95.9  
  ('rank-and-file', 'democrats')  |    1    | -127.0 
       ('south', 'koreans')       |   10    | -271.4 
      ('general', 'motors')       |   55    | -284.2 
       ('pig', 'democrats')       |    1    | -330.7 
     ('liberal', 'democrats')     |   46    | -375.9 
      ('winter', 'olympics')      |    3    | -482.0 
     ('divide', 'democrats')      |    1    | -516.9 
       ('south', 'koreans'

In [119]:
filtered_trigram_counter = Counter()
for tagged_trigram, count in tagged_trigram_counter.items():
    _, tags = zip(*tagged_trigram)
    if tags in tag_combs:
        filtered_trigram_counter[tagged_trigram] = count
        
student_t = dict()
n_trigrams = sum(filtered_trigram_counter.values())
for tagged_trigram, count in filtered_trigram_counter.items():
    trigram, _ = zip(*tagged_trigram)
    student_t[tagged_trigram] = math.sqrt(count) 
    student_t[tagged_trigram] -= unigram_counter[trigram[0]] * unigram_counter[trigram[1]] / n_trigrams / math.sqrt(count)
    
most_common = sorted(student_t, key=student_t.get, reverse=True)[:20]
print("                        trigram                        |  count  |  t-test  ")
for tagged_trigram in most_common:
    trigram, _ = zip(*tagged_trigram)
    print(f"{repr(trigram):^54}|{repr(tagged_trigram_counter[tagged_trigram]):^9}|{student_t[tagged_trigram]:^8.1f}")

                        trigram                        |  count  |  t-test  
           ('storied', 'leathers', 'inc.')            |    1    |  0.1   
          ('pro-eu', 'liberal', 'democrats')          |    1    |  -9.5  
            ('fervent', 'indians', 'fan')             |    1    | -16.6  
    ('french-speaking', 'christian', 'democrats')     |    1    | -43.9  
       ('centre-left', 'liberal', 'democrats')        |    1    | -62.0  
              ('koreans', 'amy', 'yang')              |    1    | -79.5  
            ('sun', 'microsystems', 'inc')            |    1    | -87.7  
      ('centre-right', 'christian', 'democrats')      |    1    | -106.9 
             ('europeans', 'weren', "'t")             |    1    | -173.6 
        ('detroit-based', 'general', 'motors')        |    1    | -177.1 
        ('center-left', 'social', 'democrats')        |    1    | -193.2 
           ('graphics', 'corp.', 'cadence')           |    1    | -217.0 
              ('fair', 'isaac', 'co