In [1]:
import nltk
import re
from collections import Counter
from itertools import islice

## Test corpus
### N-gram frequencies

In [2]:
train_corpus = []
test_corpus = []
# Read the test file line by line. Everything after the first comma of a line is
# treated as the sentence text: double quotes are stripped, the text is
# lower-cased and then tokenised with NLTK.
# NOTE(review): if the file starts with a header row it is tokenised like any
# other line -- confirm against the data file.
with open('../data/test_v2.txt','rt') as corpus_file:
    for raw_line in corpus_file:
        prefix, comma, remainder = raw_line.partition(',')
        # A line without any comma is used in full.
        sentence = remainder if comma else raw_line
        sentence = re.sub('"', '', sentence).lower()
        # Digit masking (replacing every [0-9] with "#") is currently disabled.
        test_corpus.append(nltk.word_tokenize(sentence))

In [3]:
# Count how often every token occurs across all tokenised lines of the corpus.
unigram_counter = Counter()
for line_no, tokens in enumerate(test_corpus):
    # Progress indicator, refreshed in place once every 1000 lines.
    if line_no % 1000 == 0:
        print("working on {}kth line".format(line_no // 1000), end='\r')
    unigram_counter.update(tokens)

working on 306kth line

In [4]:
# Show the 30 most frequent tokens of the test corpus together with their counts.
unigram_counter.most_common(30)

[('the', 402020),
 (',', 342277),
 ('.', 310630),
 ('to', 178207),
 ('of', 168734),
 ('a', 160164),
 ('and', 158065),
 ('in', 144187),
 ('that', 67490),
 ("'s", 66412),
 ('for', 65828),
 ('on', 57408),
 ('is', 54571),
 ('with', 45371),
 ('was', 45145),
 ('it', 44682),
 ('said', 42003),
 ('as', 39376),
 ('he', 38611),
 ('at', 37377),
 ('by', 33483),
 ('from', 32106),
 ('be', 30163),
 ('his', 29807),
 ('have', 29134),
 ('has', 28797),
 ('but', 27430),
 ('are', 27330),
 ('an', 24998),
 ('not', 23329)]

In [5]:
def get_windows(seq, n):
    """Yield every window of ``n`` consecutive items of ``seq`` as a sorted tuple.

    The items inside each window are sorted so that windows holding the same
    words compare (and hash) equal regardless of the order in which the words
    appeared; each window is therefore an order-insensitive group of ``n``
    adjacent items.

    Args:
        seq: Any iterable of mutually orderable items (e.g. a list of tokens).
        n: Window size; must be at least 1.

    Yields:
        tuple: The ``n`` items of each successive window, in sorted order.
        Nothing is yielded when ``seq`` holds fewer than ``n`` items.

    Raises:
        ValueError: If ``n`` is smaller than 1. Because this is a generator,
            the error surfaces when iteration starts, not at call time.
    """
    # A window size of 0 used to produce an empty tuple followed by 1-element
    # windows; reject it explicitly (negative sizes already failed in islice).
    if n < 1:
        raise ValueError("window size n must be at least 1, got {!r}".format(n))

    it = iter(seq)
    window = list(islice(it, n))
    if len(window) < n:
        # Input shorter than one window: nothing to emit.
        return
    yield tuple(sorted(window))

    for elem in it:
        # Slide by one: drop the oldest item, append the newest.
        window = window[1:] + [elem]
        yield tuple(sorted(window))

In [6]:
# Count order-insensitive pairs of adjacent tokens within each corpus line.
bigram_counter = Counter()
for line_no, tokens in enumerate(test_corpus):
    # Progress indicator, refreshed in place once every 1000 lines.
    if line_no % 1000 == 0:
        print("working on {}kth line".format(line_no // 1000), end='\r')
    bigram_counter.update(get_windows(tokens, 2))

working on 306kth line

In [7]:
# Show the 30 most frequent bigrams with their counts. Keys are the sorted
# tuples produced by get_windows, so a pair such as ('the', 'to') is counted
# regardless of which of the two tokens came first in the text.
bigram_counter.most_common(30)

[(('of', 'the'), 40629),
 (('in', 'the'), 35643),
 ((',', 'the'), 25161),
 ((',', 'and'), 20033),
 (('the', 'to'), 16872),
 (('on', 'the'), 13265),
 (('for', 'the'), 13261),
 ((',', 'a'), 11769),
 (('at', 'the'), 10823),
 (('and', 'the'), 10781),
 (('.', 'said'), 10777),
 ((',', 'but'), 10552),
 ((',', 'said'), 10303),
 (('a', 'in'), 10025),
 ((',', 'which'), 9591),
 (('be', 'to'), 9196),
 (('a', 'of'), 7970),
 (('the', 'with'), 7961),
 ((',', 'who'), 7856),
 ((',', 'he'), 7799),
 (('that', 'the'), 7718),
 (('from', 'the'), 7638),
 (('he', 'said'), 7160),
 (('by', 'the'), 7086),
 (('a', 'as'), 5704),
 (('a', 'to'), 5446),
 ((',', 'it'), 5344),
 (('a', 'with'), 5342),
 (('a', 'for'), 5289),
 (('more', 'than'), 5263)]

In [8]:
# Count order-insensitive triples of adjacent tokens within each corpus line.
trigram_counter = Counter()
for line_no, tokens in enumerate(test_corpus):
    # Progress indicator, refreshed in place once every 1000 lines.
    if line_no % 1000 == 0:
        print("working on {}kth line".format(line_no // 1000), end='\r')
    trigram_counter.update(get_windows(tokens, 3))

working on 306kth line

In [10]:
# Show the 30 most frequent trigrams with their counts. Keys are the sorted
# tuples produced by get_windows, so the tokens inside a key are in sorted
# order rather than in the order they appeared in the text.
trigram_counter.most_common(30)

[((',', 'he', 'said'), 3985),
 (('of', 'one', 'the'), 2603),
 (('.', 'he', 'said'), 2533),
 ((',', 'according', 'to'), 2359),
 (('(', ')', 'ap'), 2275),
 (('states', 'the', 'united'), 2234),
 ((',', 'and', 'the'), 1989),
 (('end', 'of', 'the'), 1924),
 ((')', '-', 'ap'), 1789),
 (('as', 'as', 'well'), 1436),
 ((')', '--', 'upi'), 1289),
 (('(', ')', 'upi'), 1275),
 ((',', 'is', 'which'), 1182),
 ((',', 'but', 'the'), 1150),
 ((',', 'said', 'she'), 1146),
 (("'s", 'the', 'world'), 1116),
 ((',', 'said', 'the'), 1108),
 (('(', ')', 'reuters'), 1069),
 ((')', '-', 'reuters'), 1047),
 (('first', 'in', 'the'), 1036),
 (('according', 'the', 'to'), 1013),
 (('most', 'of', 'the'), 993),
 (('of', 'some', 'the'), 971),
 (('of', 'part', 'the'), 950),
 ((',', 'in', 'the'), 919),
 (('first', 'the', 'time'), 913),
 (("'s", 'company', 'the'), 876),
 (('first', 'for', 'the'), 863),
 (('able', 'be', 'to'), 860),
 (("'s", 'country', 'the'), 846)]