In [1]:
import nltk
import re
from collections import Counter
from itertools import islice
import pickle
import math

## test corpus

In [18]:
with open('../data/unigram_counter_test_str.pickle', 'rb') as unigram_file, \
        open('../data/bigram_counter_test_str.pickle', 'rb') as bigram_file, \
        open('../data/trigram_counter_test_str.pickle', 'rb') as trigram_file,\
        open('../data/tagged_unigrams_counter_test_str.pickle', 'rb') as tagged_unigram_file, \
        open('../data/tagged_bigrams_counter_test_str.pickle', 'rb') as tagged_bigram_file, \
        open('../data/tagged_trigrams_counter_test_str.pickle', 'rb') as tagged_trigram_file:
    unigram_counter = pickle.load(unigram_file)
    bigram_counter = pickle.load(bigram_file)
    trigram_counter = pickle.load(trigram_file)    
    tagged_unigram_counter = pickle.load(tagged_unigram_file)
    tagged_bigram_counter = pickle.load(tagged_bigram_file)
    tagged_trigram_counter = pickle.load(tagged_trigram_file)    

In [2]:
test_corpus = []
with open('../data/test_v2-preprocessed.txt','rt') as file:
    for line in file:
        test_corpus.append(line.split(' '))

# COLLOCATIONS

In [19]:
# JJ - adjective or numeral, JJR  - adjective, JJS - adjective, superlative
adjs = ['JJ','JJR','JJS']
# NN noun, common; NNP - noun, proper, sing.; NNPS - noun, proper, plural; NNS - noun, common, plural
nouns = ['NN','NNS','NNP','NNPS']
tag_combs = []
for noun in nouns:
    tag_combs += [(adj, noun) for adj in adjs] # A,N
    for adj in adjs:
        for noun_ in nouns:
            tag_combs.append((adj, noun, noun_)) # A, N, N
            tag_combs.append((noun, adj, noun_)) # N, A, N
        tag_combs += [(adj, adj_, noun) for adj_ in adjs] # A, A, N
    for noun_ in nouns:
        tag_combs += [(noun, noun_, noun__) for noun__ in nouns] # N, N, N
        tag_combs += [(noun, 'IN', noun_)] # N, P, N
    tag_combs += [(noun_, noun) for noun_ in nouns] # N, N

In [101]:
def get_most_common(test_func, tagged_ngram_counter, reverse=True, m=slice(0,20)):
    most_common = sorted(test_func, key=test_func.get, reverse=reverse)[m]
    print("               ngram              |  count  |   test   ")
    for tagged_ngram in most_common:
        ngram, _ = tagged_ngram
        print(f"{repr(ngram):^34}|{repr(tagged_ngram_counter[tagged_ngram]):^9}|{test_func[tagged_ngram]:^8.1f}")

## T-test

<div style="font-size: 20px">
$$t = \frac{\bar{X} - \mu}{\sqrt{\frac{s^2}{N}}}$$
<br><div style="font-size:15px">
$\mu=p(w1,w2)=p(w1)*p(w2)=\frac{count(w1)}{n_{bigrams}}\frac{count(w2)}{n_{bigrams}}$
<br>$\bar{X}=p(w1,w2)=\frac{count(w1,w2)}{n_{bigrams}}$
<br>$s^2=p(w1,w2)*\left(1-p(w1,w2)\right)\approx p(w1,w2),$
<br>because probability of each bigram is very small $\Rightarrow (1-p) \approx 1$
    
    
$$t = \frac{\frac{count(w1,w2)}{n_{bigrams}} - \frac{count(w1)*count(w2)}{n_{bigrams}^2}}{\sqrt{\frac{count(w1,w2)}{n_{bigrams}^2}}} = \sqrt{count(w1,w2)} - \frac{count(w1)*count(w2)}{n*\sqrt{count(w1,w2)}}$$

## BIGRAMS

In [23]:
filtered_bigram_counter = Counter()
for tagged_bigram, count in tagged_bigram_counter.items():
    _, tags = tagged_bigram
    if tags in tag_combs:
        filtered_bigram_counter[tagged_bigram] = count
        
student_t2 = dict()
n_bigrams = sum(bigram_counter.values())
for tagged_bigram, count in filtered_bigram_counter.items():
    bigram, _ = tagged_bigram
    w1, w2 = bigram.split()
    student_t2[tagged_bigram] = math.sqrt(count) 
    student_t2[tagged_bigram] -= unigram_counter[w1] * unigram_counter[w2] / n_bigrams / math.sqrt(count)

In [102]:
get_most_common(student_t2, tagged_bigram_counter, reverse=True, m=slice(0,100))

               ngram              |  count  |   test   
            'new york'            |  3290   |  57.2  
             "don 't"             |    9    |  51.2  
             "don 't"             |   27    |  51.2  
         'united states'          |  2429   |  49.4  
           'last year'            |  2540   |  49.3  
            'per cent'            |   22    |  47.8  
           'last week'            |  1610   |  39.5  
            "didn 't"             |   22    |  37.2  
         'prime minister'         |  1322   |  36.4  
            'i think'             |    7    |  34.1  
          'los angeles'           |   486   |  33.2  
          'los angeles'           |   362   |  33.2  
           'last month'           |  1054   |  32.1  
            "doesn 't"            |   15    |  32.0  
          'white house'           |  1029   |  31.9  
          'barack obama'          |   801   |  30.2  
          'barack obama'          |   25    |  30.2  
        'chief executive' 

## TRIGRAMS

In [25]:
filtered_trigram_counter = Counter()
for tagged_trigram, count in tagged_trigram_counter.items():
    _, tags = tagged_trigram
    if tags in tag_combs:
        filtered_trigram_counter[tagged_trigram] = count
        
student_t3 = dict()
n_trigrams = sum(trigram_counter.values())
for tagged_trigram, count in filtered_trigram_counter.items():
    trigram, _ = tagged_trigram
    w1, w2, w3 = trigram.split()
    student_t3[tagged_trigram] = math.sqrt(count) 
    student_t3[tagged_trigram] -= unigram_counter[w1] * unigram_counter[w2] * \
        unigram_counter[w3] / (n_trigrams * math.sqrt(count))

In [103]:
get_most_common(student_t3, tagged_trigram_counter, m=slice(0,40))

               ngram              |  count  |   test   
     'hillary rodham clinton'     |   129   |  9.7   
     'gross domestic product'     |   85    |  7.3   
             '= = ='              |   52    |  7.2   
    'greenhouse gas emissions'    |   70    |  6.4   
    'chancellor angela merkel'    |   41    |  6.3   
      'speaker nancy pelosi'      |   39    |  6.1   
        'osama bin laden'         |   37    |  6.0   
             '* * *'              |   45    |  5.7   
         'annum + bonus'          |   32    |  5.3   
     'nasdaq composite index'     |   31    |  5.2   
         'rio de janeiro'         |   28    |  5.1   
     'chairman ben bernanke'      |   47    |  5.1   
       'nobel peace prize'        |   38    |  5.1   
           'вђљ г‚ в®'            |   24    |  4.9   
      'minister ehud olmert'      |   28    |  4.9   
    'german chancellor angela'    |   30    |  4.9   
          'san suu kyi'           |   27    |  4.8   
 'secretary-general ban ki

$\chi^2$

In [31]:
from collections import defaultdict

def get_ngrams(sentence, n):
    for i in range(len(sentence) - n + 1):
        bigram = sentence[i:i+n]
        yield bigram
        yield bigram[::-1]
        
bigram_model = defaultdict(lambda: defaultdict(lambda: 0))
for step, line in enumerate(test_corpus):
    for w1, w2 in get_ngrams(line, 2):
        bigram_model[w1][w2] += 1

working on 306kth line

In [32]:
chi_sq = dict()
for tagged_bigram, _ in filtered_bigram_counter.items():
    bigram, _ = tagged_bigram
    w1, w2 = bigram.split()
    o11 = bigram_model[w1][w2]
    o12 = sum(bigram_model[w2].values()) - o11
    o21 = sum(bigram_model[w1].values()) - o11
    o22 = n_bigrams - o11 - o12 - o21
    chi_sq[tagged_bigram] = n_bigrams * (o11 * o22 - o12 * o21)**2
    chi_sq[tagged_bigram] /= ((o11 + o12)*(o11 + o21)*(o12 + o22)*(o21 + o22))

In [35]:
get_most_common(chi_sq, tagged_bigram_counter, reverse=False)

               ngram              |  count  |   test   
        'more immigrants'         |    1    |  0.0   
          'health firms'          |    1    |  0.0   
         'hospital today'         |    2    |  0.0   
           'long movie'           |    1    |  0.0   
          'family movie'          |    1    |  0.0   
         'strategy game'          |    1    |  0.0   
       'organization money'       |    1    |  0.0   
          'york streets'          |    1    |  0.0   
          'economy cars'          |    1    |  0.0   
         'review system'          |    1    |  0.0   
           'ap reports'           |    1    |  0.0   
        'cost management'         |    1    |  0.0   
         'weapons school'         |    1    |  0.0   
          'average per'           |    1    |  0.0   
       'american standards'       |    1    |  0.0   
          'way airlines'          |    1    |  0.0   
           'same hour'            |    1    |  0.0   
          'job officer'   

In [36]:
get_most_common(chi_sq, tagged_bigram_counter, reverse=True)

               ngram              |  count  |   test   
         '-great -great'          |    1    |5239824.4
         '-great -great'          |    1    |5239824.4
         '-great -great'          |    3    |5239824.4
     'tapeshwar vishwakarma'      |    1    |3772675.5
           'zuo lianbi'           |    1    |3772675.5
         'soud baкјalawy'         |    1    |3772675.5
         'jaysuma saidy'          |    1    |3772675.5
   'noluthando mayende-sibiya'    |    1    |3772675.5
           'rxc7 bxc7'            |    1    |3772675.5
           'kola liadi'           |    1    |3772675.5
           'igam ogam'            |    1    |3772675.5
      'diarmaid macculloch'       |    1    |3772675.5
         'maka chawoneka'         |    1    |3772675.5
          'sajjan gohel'          |    1    |3772675.5
          'senny manave'          |    1    |3772675.5
        'krysztof wiecek'         |    1    |3772675.5
       'ladenburg thalmann'       |    1    |3772675.5
       'l

# likelyhood

In [51]:
import numpy as np

In [79]:
max_likelyhood2 = dict()
for tagged_bigram, c12 in filtered_bigram_counter.items():
    bigram, _ = tagged_bigram
    w1, w2 = bigram.split()
    c1, c2 = unigram_counter[w1], unigram_counter[w2] 
#     c1, c2, c12 = sum(bigram_model[w1].values()), sum(bigram_model[w2].values()), bigram_model[w1][w2]
    max_likelyhood2[tagged_bigram] = np.log(c12 * n_bigrams / c1 / c2)

In [80]:
get_most_common(max_likelyhood2, tagged_bigram_counter, reverse=True)

               ngram              |  count  |   test   
         'safwat hijazi'          |    1    |  15.8  
        'sadafumi kawato'         |    1    |  15.8  
       'simгіn bolг\xadvar'       |    1    |  15.8  
      'super-chic workboot'       |    1    |  15.8  
        'rajne soderberg'         |    1    |  15.8  
     'tapeshwar vishwakarma'      |    1    |  15.8  
           'zuo lianbi'           |    1    |  15.8  
        'taichiro kiyota'         |    1    |  15.8  
   'receptor-associated kinase'   |    1    |  15.8  
    'french-trained gendarme'     |    1    |  15.8  
        'prгєt-г -porter'         |    1    |  15.8  
          'antti niemi'           |    1    |  15.8  
         'soud baкјalawy'         |    1    |  15.8  
         'brimin kipruto'         |    1    |  15.8  
 "'brien-trained four-year-olds"  |    1    |  15.8  
         'emneth hungate'         |    1    |  15.8  
         'amancio ortego'         |    1    |  15.8  
          'ioana raluca'  

In [56]:
max_likelyhood3 = dict()
for tagged_trigram, c123 in filtered_trigram_counter.items():
    trigram, _ = tagged_trigram
    s23 = trigram.rfind(' ')
    w12, w3 = trigram[:s23], trigram[s23+1:]
    c12, c3 = bigram_counter[w12] , unigram_counter[w3] 
    max_likelyhood3[tagged_trigram] = np.log(c123 * n_bigrams / c12 / c3)

In [57]:
get_most_common(max_likelyhood3, tagged_trigram_counter)

               ngram              |  count  |   test   
  'passage between livestockism'  |    1    |  15.8  
         'qadri al ahdal'         |    1    |  15.8  
        'samir al quntar'         |    1    |  15.8  
       'pit near chessell'        |    1    |  15.8  
       'lars ole orjaseter'       |    1    |  15.8  
        'salon du dessin'         |    1    |  15.8  
    'technology website heise'    |    1    |  15.8  
         'roche / chugai'         |    1    |  15.8  
  'euromeetings president rajne'  |    1    |  15.8  
   'president rajne soderberg'    |    1    |  15.8  
    'general anthony krastek'     |    1    |  15.8  
    'professor donald macrae'     |    1    |  15.8  
       'alila villas soori'       |    1    |  15.8  
'interleukin-1 receptor-associated kinase'|    1    |  15.8  
       'design new get-ups'       |    1    |  15.8  
 'former french-trained gendarme' |    1    |  15.8  
     'paris prгєt-г -porter'      |    1    |  15.8  
      'general tom

# nltk

In [48]:
from itertools import chain
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures

words = list(chain.from_iterable(test_corpus))
bigram_finder = BigramCollocationFinder.from_words(words)

In [90]:
bigram_finder.nbest(BigramAssocMeasures.student_t, 100)

[('.', "''\n"),
 ('``', "''"),
 ('of', 'the'),
 ('in', 'the'),
 ("''\n", 'the'),
 (',', '``'),
 ('on', 'the'),
 ("''\n", 'but'),
 (',', 'which'),
 (',', 'but'),
 ('said', '.'),
 ('to', 'be'),
 (',', 'and'),
 ('at', 'the'),
 ('for', 'the'),
 ("''\n", 'it'),
 ("''\n", 'he'),
 ("''\n", 'i'),
 (',', 'who'),
 ('?', "''\n"),
 ('more', 'than'),
 ('has', 'been'),
 ('will', 'be'),
 ('in', 'a'),
 ('have', 'been'),
 ('from', 'the'),
 ('he', 'said'),
 ('it', 'was'),
 ('it', 'is'),
 ('it', "'s"),
 ('as', 'a'),
 ('the', 'first'),
 ('by', 'the'),
 ('with', 'the'),
 ('one', 'of'),
 ('he', 'was'),
 ('with', 'a'),
 ('to', 'the'),
 ('new', 'york'),
 ("''\n", 'we'),
 ('according', 'to'),
 (')', '-'),
 ('the', 'world'),
 ("''", 'he'),
 ("''", 'said'),
 ('would', 'be'),
 ('had', 'been'),
 ('for', 'a'),
 ('is', 'a'),
 ('don', "'t"),
 ("''\n", 'this'),
 (',', 'including'),
 ('last', 'year'),
 ("''\n", 'they'),
 ('the', 'company'),
 ('the', 'same'),
 ('united', 'states'),
 ('over', 'the'),
 ('the', 'united'),
