In [1]:
import nltk
import re
from collections import Counter
from itertools import islice
import pickle
import math

## test corpus

In [2]:
with open('../data/unigram_counter_test_str.pickle', 'rb') as unigram_file, \
        open('../data/bigram_counter_test_str.pickle', 'rb') as bigram_file, \
        open('../data/trigram_counter_test_str.pickle', 'rb') as trigram_file,\
        open('../data/tagged_unigrams_counter_test_str.pickle', 'rb') as tagged_unigram_file, \
        open('../data/tagged_bigrams_counter_test_str.pickle', 'rb') as tagged_bigram_file, \
        open('../data/tagged_trigrams_counter_test_str.pickle', 'rb') as tagged_trigram_file:
    unigram_counter = pickle.load(unigram_file)
    bigram_counter = pickle.load(bigram_file)
    trigram_counter = pickle.load(trigram_file)    
    tagged_unigram_counter = pickle.load(tagged_unigram_file)
    tagged_bigram_counter = pickle.load(tagged_bigram_file)
    tagged_trigram_counter = pickle.load(tagged_trigram_file)    

In [3]:
test_corpus = []
with open('../data/test_v2-preprocessed.txt','rt') as file:
    for line in file:
        test_corpus.append(line.rstrip().split(' '))

# COLLOCATIONS

In [4]:
# JJ - adjective or numeral, JJR  - adjective, JJS - adjective, superlative
adjs = ['JJ','JJR','JJS']
# NN noun, common; NNP - noun, proper, sing.; NNPS - noun, proper, plural; NNS - noun, common, plural
nouns = ['NN','NNS','NNP','NNPS']
tag_combs = []
for noun in nouns:
    tag_combs += [(adj, noun) for adj in adjs] # A,N
    for adj in adjs:
        for noun_ in nouns:
            tag_combs.append((adj, noun, noun_)) # A, N, N
            tag_combs.append((noun, adj, noun_)) # N, A, N
        tag_combs += [(adj, adj_, noun) for adj_ in adjs] # A, A, N
    for noun_ in nouns:
        tag_combs += [(noun, noun_, noun__) for noun__ in nouns] # N, N, N
        tag_combs += [(noun, 'IN', noun_)] # N, P, N
    tag_combs += [(noun_, noun) for noun_ in nouns] # N, N

In [5]:
def get_most_common(test_func, tagged_ngram_counter, reverse=True, m=slice(0,20)):
    most_common = sorted(test_func, key=test_func.get, reverse=reverse)[m]
    print("               ngram              |        tags        |  count  |   test   ")
    for tagged_ngram in most_common:
        ngram, tags = tagged_ngram
        print(f"{repr(ngram):^34}|{repr(tags):^20}|{repr(tagged_ngram_counter[tagged_ngram]):^9}|{test_func[tagged_ngram]:^8.1f}")

## T-test

<div style="font-size: 20px">
$$t = \frac{\bar{X} - \mu}{\sqrt{\frac{s^2}{N}}}$$
<br><div style="font-size:15px">
$\mu=p(w1,w2)=p(w1)*p(w2)=\frac{count(w1)}{n_{bigrams}}\frac{count(w2)}{n_{bigrams}}$
<br>$\bar{X}=p(w1,w2)=\frac{count(w1,w2)}{n_{bigrams}}$
<br>$s^2=p(w1,w2)*\left(1-p(w1,w2)\right)\approx p(w1,w2),$
<br>because probability of each bigram is very small $\Rightarrow (1-p) \approx 1$
    
    
$$t = \frac{\frac{count(w1,w2)}{n_{bigrams}} - \frac{count(w1)*count(w2)}{n_{bigrams}^2}}{\sqrt{\frac{count(w1,w2)}{n_{bigrams}^2}}} = \sqrt{count(w1,w2)} - \frac{count(w1)*count(w2)}{n*\sqrt{count(w1,w2)}}$$

## BIGRAMS

In [6]:
filtered_bigram_counter = Counter()
for tagged_bigram, count in tagged_bigram_counter.items():
    _, tags = tagged_bigram
    if tags in tag_combs:
        filtered_bigram_counter[tagged_bigram] = count
        
student_t2 = dict()
n_bigrams = sum(bigram_counter.values())
for tagged_bigram, count in filtered_bigram_counter.items():
    bigram, _ = tagged_bigram
    w1, w2 = bigram.split()
    student_t2[tagged_bigram] = math.sqrt(count) 
    student_t2[tagged_bigram] -= unigram_counter[w1] * unigram_counter[w2] / n_bigrams / math.sqrt(count)

In [7]:
get_most_common(student_t2, tagged_bigram_counter, reverse=True, m=slice(0,50))

               ngram              |        tags        |  count  |   test   
            'new york'            |    ('JJ', 'NN')    |  3320   |  57.5  
           'last year'            |    ('JJ', 'NN')    |  2540   |  50.1  
         'united states'          |   ('JJ', 'NNS')    |  2456   |  49.5  
           'last week'            |    ('JJ', 'NN')    |  1610   |  39.9  
         'prime minister'         |    ('JJ', 'NN')    |  1326   |  36.4  
          'los angeles'           |   ('NN', 'NNS')    |  1101   |  33.2  
           'last month'           |    ('JJ', 'NN')    |  1054   |  32.3  
          'white house'           |    ('JJ', 'NN')    |  1029   |  32.0  
           'first time'           |    ('JJ', 'NN')    |   958   |  30.6  
          'barack obama'          |    ('NN', 'NN')    |   919   |  30.3  
        'chief executive'         |    ('JJ', 'NN')    |   898   |  29.9  
          'health care'           |    ('NN', 'NN')    |   774   |  27.8  
             'i don'   

In [26]:
get_most_common(student_t2, tagged_bigram_counter, reverse=False, m=slice(0,50))

               ngram              |        tags        |  count  |   test   
            'other i'             |    ('JJ', 'NN')    |    1    | -24.4  
             'i time'             |    ('NN', 'NN')    |    1    | -21.4  
          'government i'          |    ('NN', 'NN')    |    1    | -19.4  
          'president i'           |    ('NN', 'NN')    |    1    | -15.7  
          'people year'           |   ('NNS', 'NN')    |    1    | -15.5  
          'year people'           |   ('NN', 'NNS')    |    1    | -15.5  
            'new time'            |    ('JJ', 'NN')    |    1    | -15.4  
            'i people'            |   ('NN', 'NNS')    |    3    | -15.3  
         'people people'          |   ('NNS', 'NNS')   |    1    | -15.1  
          'last people'           |   ('JJ', 'NNS')    |    1    | -14.7  
           'company i'            |    ('NN', 'NN')    |    1    | -14.3  
            'state i'             |    ('NN', 'NN')    |    1    | -13.7  
             'get i'   

## TRIGRAMS

In [8]:
filtered_trigram_counter = Counter()
for tagged_trigram, count in tagged_trigram_counter.items():
    _, tags = tagged_trigram
    if tags in tag_combs:
        filtered_trigram_counter[tagged_trigram] = count
        
student_t3 = dict()
n_trigrams = sum(trigram_counter.values())
for tagged_trigram, count in filtered_trigram_counter.items():
    trigram, _ = tagged_trigram
    w1, w2, w3 = trigram.split()
    student_t3[tagged_trigram] = math.sqrt(count) 
    student_t3[tagged_trigram] -= unigram_counter[w1] * unigram_counter[w2] * \
        unigram_counter[w3] / (n_trigrams * math.sqrt(count))

In [9]:
get_most_common(student_t3, tagged_trigram_counter, m=slice(0,50))

               ngram              |        tags        |  count  |   test   
     'hillary rodham clinton'     | ('JJ', 'NN', 'NN') |   135   |  9.9   
             '* * *'              |('JJ', 'NNP', 'NN') |   81    |  8.2   
        'osama bin laden'         | ('JJ', 'NN', 'NN') |   65    |  8.0   
         'george w. bush'         | ('NN', 'NN', 'NN') |   194   |  7.9   
             '= = ='              |('JJ', 'NNP', 'NN') |   55    |  7.4   
     'gross domestic product'     | ('JJ', 'JJ', 'NN') |   85    |  7.3   
       'pope benedict xvi'        | ('NN', 'NN', 'NN') |   49    |  7.0   
          'san suu kyi'           | ('JJ', 'NN', 'NN') |   51    |  6.8   
          'aung san suu'          | ('JJ', 'NN', 'NN') |   48    |  6.8   
      'speaker nancy pelosi'      | ('NN', 'NN', 'NN') |   47    |  6.7   
    'greenhouse gas emissions'    |('NN', 'NN', 'NNS') |   74    |  6.6   
    'chancellor angela merkel'    | ('NN', 'NN', 'NN') |   44    |  6.5   
       'nobel peace pri

In [27]:
get_most_common(student_t3, tagged_trigram_counter, reverse=False, m=slice(0,50))

               ngram              |        tags        |  count  |   test   
         'police in have'         | ('NN', 'IN', 'NN') |    1    |-3552082.9
        'people of people'        |('NNS', 'IN', 'NNS')|    1    |-2842770.3
            'i that i'            | ('NN', 'IN', 'NN') |    2    |-2684572.1
           'city of i'            | ('NN', 'IN', 'NN') |    1    |-2097445.4
           'i in back'            | ('NN', 'IN', 'NN') |    1    |-2075549.2
       'year of government'       | ('NN', 'IN', 'NN') |    1    |-2011519.2
          'time in year'          | ('NN', 'IN', 'NN') |    1    |-1889234.9
        'people of world'         |('NNS', 'IN', 'NN') |    1    |-1727302.4
      'people in government'      |('NNS', 'IN', 'NN') |    1    |-1679840.3
         'state in about'         | ('NN', 'IN', 'NN') |    1    |-1657291.7
       'company of people'        |('NN', 'IN', 'NNS') |    1    |-1473511.8
         'year of state'          | ('NN', 'IN', 'NN') |    1    |-1453886.1

<div style="font-size: 20px"> 
$\chi^2$ test
<div style="font-size: 15px"> 
<br>In the simplest case, the $\chi^2$ test is applied to 2-by-2 tables like table below. The essence of the test is to compare the observed frequencies in the table with the frequencies expected for independence. If the difference between observed and expected frequencies is large, then we can reject the null hypothesis of independence.

|  | $w1 = new$ | $w1 \neq new$ |
| --- | --- | --- |
| $w2 = companies$ | 8 | 4667 |
| $w2 \neq companies$ | 15820 | 142871818 |

<br> Take the statistics $X^2 = \sum_{i,j} \frac{(O_{ij}-E_{ij})^2}{E_{ij}}$
<br> If $H_0$ is true, i.e. each of words in collocation is generated completely independently of the other, the quantity $X^2$ is asymptotically $\chi^2$ distributed.
<br> The $\chi^2$ test can be applied to tables of any size, but it has a simpler form for 2-by-2 tables:
<br> $\chi^2 = \frac{N*(O_{11}O_{22} - O_{12}O_{21})}{(O_{11}+O_{12})(O_{11}+O_{21})(O_{12}+O_{22})(O_{21}+O_{22})}$
<br>$\chi^2_{0.05} = 3.841$


In [10]:
from collections import defaultdict

def get_ngrams(sentence, n):
    for i in range(len(sentence) - n + 1):
        bigram = sentence[i:i+n]
        yield bigram
        yield bigram[::-1]
        
bigram_model = defaultdict(lambda: defaultdict(lambda: 0))
for step, line in enumerate(test_corpus):
    for w1, w2 in get_ngrams(line, 2):
        bigram_model[w1][w2] += 1

In [11]:
chi_sq = dict()
for tagged_bigram, _ in filtered_bigram_counter.items():
    bigram, _ = tagged_bigram
    w1, w2 = bigram.split()
    o11 = bigram_model[w1][w2]
    o12 = sum(bigram_model[w2].values()) - o11
    o21 = sum(bigram_model[w1].values()) - o11
    o22 = n_bigrams - o11 - o12 - o21
    chi_sq[tagged_bigram] = n_bigrams * (o11 * o22 - o12 * o21)**2
    chi_sq[tagged_bigram] /= ((o11 + o12)*(o11 + o21)*(o12 + o22)*(o21 + o22))

In [24]:
get_most_common(chi_sq, tagged_bigram_counter)

               ngram              |        tags        |  count  |   test   
      'esclerosis mгєltiple'      |    ('NN', 'NN')    |    1    |7227116.0
         '-great -great'          |    ('NN', 'NN')    |    5    |5018827.2
     'tapeshwar vishwakarma'      |    ('NN', 'NN')    |    1    |3613557.5
           'zuo lianbi'           |    ('NN', 'NN')    |    1    |3613557.5
         'soud baкјalawy'         |    ('NN', 'NN')    |    1    |3613557.5
         'jaysuma saidy'          |    ('NN', 'NN')    |    1    |3613557.5
   'noluthando mayende-sibiya'    |    ('JJ', 'NN')    |    1    |3613557.5
           'rxc7 bxc7'            |    ('NN', 'NN')    |    1    |3613557.5
           'kola liadi'           |    ('NN', 'NN')    |    1    |3613557.5
           'igam ogam'            |    ('NN', 'NN')    |    1    |3613557.5
      'diarmaid macculloch'       |    ('NN', 'NN')    |    1    |3613557.5
         'maka chawoneka'         |    ('NN', 'NN')    |    1    |3613557.5
          '

In [28]:
get_most_common(chi_sq, tagged_bigram_counter, reverse=False)

               ngram              |        tags        |  count  |   test   
            'tv media'            |   ('NN', 'NNS')    |    1    |  0.0   
      'largest development'       |   ('JJS', 'NN')    |    1    |  0.0   
        'several football'        |    ('JJ', 'NN')    |    1    |  0.0   
       'several elections'        |   ('JJ', 'NNS')    |    1    |  0.0   
          'india share'           |    ('NN', 'NN')    |    1    |  0.0   
          'enough cases'          |   ('JJ', 'NNS')    |    1    |  0.0   
           'watch show'           |    ('NN', 'NN')    |    1    |  0.0   
          'english game'          |    ('JJ', 'NN')    |    1    |  0.0   
        'provide building'        |    ('NN', 'NN')    |    1    |  0.0   
      'administration shows'      |   ('NN', 'NNS')    |    1    |  0.0   
          'many resort'           |    ('JJ', 'NN')    |    1    |  0.0   
         'many manhattan'         |   ('JJ', 'NNS')    |    1    |  0.0   
          'buy britain'

### let's see CHI-SQUARED test result for more frequent bigrams

In [25]:
filtered_cs2 = {i:j for i,j in chi_sq.items() if tagged_bigram_counter[i]>160}
get_most_common(filtered_cs2, tagged_bigram_counter, m=slice(0,50))

               ngram              |        tags        |  count  |   test   
          'los angeles'           |   ('NN', 'NNS')    |  1101   |1809628.7
           'hong kong'            |    ('NN', 'NN')    |   322   |1646998.1
            'пїѕ пїѕ'             |    ('NN', 'NN')    |   379   |1633486.1
           'las vegas'            |   ('NNS', 'NNS')   |   218   |1369042.7
          'saudi arabia'          |    ('NN', 'NN')    |   167   |987565.1
         'prime minister'         |    ('JJ', 'NN')    |  1326   |788904.2
           'swine flu'            |    ('NN', 'NN')    |   243   |749368.7
         'united states'          |   ('JJ', 'NNS')    |  2456   |734164.5
         'san francisco'          |    ('JJ', 'NN')    |   403   |629484.9
            'al qaeda'            |    ('NN', 'NN')    |   245   |612394.8
   'forward-looking statements'   |   ('JJ', 'NNS')    |   249   |570793.5
          '/ prnewswire'          |    ('NN', 'NN')    |   306   |439006.0
          'prnewswi

<div style="font-size: 20px">
Max Likelihood test

In [14]:
import numpy as np

In [15]:
max_likelihood2 = dict()
for tagged_bigram, c12 in filtered_bigram_counter.items():
    bigram, _ = tagged_bigram
    w1, w2 = bigram.split()
    c1, c2 = unigram_counter[w1], unigram_counter[w2] 
#     c1, c2, c12 = sum(bigram_model[w1].values()), sum(bigram_model[w2].values()), bigram_model[w1][w2]
    max_likelihood2[tagged_bigram] = np.log(c12 * n_bigrams / c1 / c2)

In [16]:
get_most_common(max_likelihood2, tagged_bigram_counter, reverse=True)

               ngram              |        tags        |  count  |   test   
         'safwat hijazi'          |    ('NN', 'NN')    |    1    |  15.8  
        'sadafumi kawato'         |    ('NN', 'NN')    |    1    |  15.8  
          'katell djian'          |    ('NN', 'NN')    |    1    |  15.8  
       'simгіn bolг\xadvar'       |    ('NN', 'NN')    |    1    |  15.8  
      'super-chic workboot'       |    ('JJ', 'NN')    |    1    |  15.8  
        'rajne soderberg'         |    ('NN', 'NN')    |    1    |  15.8  
     'tapeshwar vishwakarma'      |    ('NN', 'NN')    |    1    |  15.8  
           'zuo lianbi'           |    ('NN', 'NN')    |    1    |  15.8  
        'taichiro kiyota'         |    ('NN', 'NN')    |    1    |  15.8  
   'receptor-associated kinase'   |    ('JJ', 'NN')    |    1    |  15.8  
    'french-trained gendarme'     |    ('JJ', 'NN')    |    1    |  15.8  
       'veltliner kamptal'        |    ('NN', 'NN')    |    1    |  15.8  
        'prгєt-г -porte

In [17]:
max_likelihood3 = dict()
for tagged_trigram, c123 in filtered_trigram_counter.items():
    trigram, _ = tagged_trigram
    s23 = trigram.rfind(' ')
    w12, w3 = trigram[:s23], trigram[s23+1:]
    c12, c3 = bigram_counter[w12] , unigram_counter[w3] 
    max_likelihood3[tagged_trigram] = np.log(c123 * n_bigrams / c12 / c3)

In [18]:
get_most_common(max_likelihood3, tagged_trigram_counter)

               ngram              |        tags        |  count  |   test   
  'passage between livestockism'  | ('NN', 'IN', 'NN') |    1    |  15.8  
         'qadri al ahdal'         | ('NN', 'NN', 'NN') |    1    |  15.8  
        'samir al quntar'         | ('NN', 'NN', 'NN') |    1    |  15.8  
       'pit near chessell'        | ('NN', 'IN', 'NN') |    1    |  15.8  
        'salon du dessin'         | ('NN', 'NN', 'NN') |    1    |  15.8  
    'technology website heise'    |('NN', 'NNS', 'NN') |    1    |  15.8  
  'euromeetings president rajne'  |('NNS', 'NN', 'NN') |    1    |  15.8  
    'general anthony krastek'     | ('JJ', 'NN', 'NN') |    1    |  15.8  
    'professor donald macrae'     | ('NN', 'NN', 'NN') |    1    |  15.8  
   'traditional inupiat eskimo'   | ('JJ', 'NN', 'NN') |    1    |  15.8  
'interleukin-1 receptor-associated kinase'| ('JJ', 'JJ', 'NN') |    1    |  15.8  
       'design new get-ups'       |('NN', 'JJ', 'NNS') |    1    |  15.8  
 'former french

### let's see MAX LIKELIHOOD test result for more frequent bigrams and trigrams

In [19]:
filtered_ml2 = {i:j for i,j in max_likelihood2.items() if tagged_bigram_counter[i]>160}
get_most_common(filtered_ml2, tagged_bigram_counter, m=slice(0,50))

               ngram              |        tags        |  count  |   test   
           'las vegas'            |   ('NNS', 'NNS')   |   218   |  10.1  
          'saudi arabia'          |    ('NN', 'NN')    |   167   |  10.0  
           'hong kong'            |    ('NN', 'NN')    |   322   |  9.8   
           'swine flu'            |    ('NN', 'NN')    |   243   |  9.4   
            'al qaeda'            |    ('NN', 'NN')    |   245   |  9.2   
   'forward-looking statements'   |   ('JJ', 'NNS')    |   249   |  9.1   
           'st. louis'            |    ('NN', 'NN')    |   166   |  8.9   
          'los angeles'           |   ('NN', 'NNS')    |  1101   |  8.7   
         'san francisco'          |    ('JJ', 'NN')    |   403   |  8.7   
           'san diego'            |    ('JJ', 'NN')    |   243   |  8.6   
           'dow jones'            |   ('NN', 'NNS')    |   172   |  8.6   
           'george w.'            |    ('NN', 'NN')    |   206   |  8.4   
            'пїѕ пїѕ'  

In [20]:
filtered_ml3 = {i:j for i,j in max_likelihood3.items() if tagged_trigram_counter[i]>25}
get_most_common(filtered_ml3, tagged_trigram_counter, m=slice(0,50))

               ngram              |        tags        |  count  |   test   
         'rio de janeiro'         | ('NN', 'IN', 'NN') |   33    |  12.0  
      'president hu jintao'       | ('NN', 'NN', 'NN') |   35    |  11.8  
 'secretary-general ban ki-moon'  | ('JJ', 'NN', 'NN') |   27    |  11.7  
          'kim jong il'           | ('NN', 'NN', 'NN') |   26    |  11.5  
           'вђљ г‚ в®'            |('JJ', 'NNP', 'NN') |   32    |  11.5  
       'pope benedict xvi'        | ('NN', 'NN', 'NN') |   49    |  11.4  
      'united arab emirates'      |('JJ', 'NN', 'NNS') |   52    |  11.2  
   'venezuelan president hugo'    | ('NN', 'NN', 'NN') |   29    |  11.2  
        'asif ali zardari'        | ('NN', 'NN', 'NN') |   31    |  11.0  
          'san suu kyi'           | ('JJ', 'NN', 'NN') |   51    |  11.0  
          'aung san suu'          | ('JJ', 'NN', 'NN') |   48    |  11.0  
          'gulf of aden'          | ('NN', 'IN', 'NN') |   27    |  11.0  
             '= = ='   

# nltk

In [21]:
from itertools import chain
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures

words = list(chain.from_iterable(test_corpus))
bigram_finder = BigramCollocationFinder.from_words(words)

In [22]:
bigram_finder.nbest(BigramAssocMeasures.student_t, 100)

[('``', "''"),
 ('of', 'the'),
 ('in', 'the'),
 ('.', 'the'),
 (',', '``'),
 (',', 'which'),
 ('on', 'the'),
 (',', 'but'),
 ('said', '.'),
 ('to', 'be'),
 ('.', 'but'),
 ('at', 'the'),
 ('for', 'the'),
 (',', 'and'),
 ('.', 'it'),
 ('.', 'he'),
 (',', 'who'),
 ('.', 'i'),
 ('more', 'than'),
 ('has', 'been'),
 ('will', 'be'),
 ('have', 'been'),
 ('in', 'a'),
 ('from', 'the'),
 ('he', 'said'),
 ('it', 'was'),
 ('it', 'is'),
 ('it', "'s"),
 ('as', 'a'),
 ('the', 'first'),
 ('by', 'the'),
 ('one', 'of'),
 ('with', 'the'),
 ('he', 'was'),
 ('with', 'a'),
 ('new', 'york'),
 ('according', 'to'),
 ('.', 'we'),
 (')', '-'),
 ("''", 'he'),
 ("''", 'said'),
 ('the', 'world'),
 ('to', 'the'),
 ('would', 'be'),
 ('had', 'been'),
 ('for', 'a'),
 ('is', 'a'),
 ('don', "'t"),
 (',', 'including'),
 ('.', 'this'),
 ('last', 'year'),
 ('the', 'company'),
 ('united', 'states'),
 ('the', 'same'),
 ('over', 'the'),
 ('the', 'united'),
 ('(', 'ap'),
 ('ap', ')'),
 ('.', 'they'),
 ('out', 'of'),
 ('per', 'ce