Importing modules

In [5]:
import csv
import itertools
import json
import os
import sys
from collections import Counter
from functools import reduce

# Extend the import path with the parent directory before importing from src.
sys.path.append("..")

from src.text_chunk import TextChunk

Reading raw train texts

In [2]:
def get_data(path):
    """Load consecutively numbered problem files from ``path`` as TextChunk objects.

    Files are looked up as ``problem-1.txt``, ``problem-2.txt``, ... and
    loading stops at the first index whose file does not exist.

    Args:
        path: Directory containing the ``problem-<i>.txt`` files.

    Returns:
        A list with one ``TextChunk`` per file, in index order. The list is
        empty when ``problem-1.txt`` is missing.
    """
    data = []

    for i in itertools.count(start=1):
        file_path = os.path.join(path, 'problem-' + str(i) + '.txt')
        try:
            # The context manager closes the handle right away instead of
            # leaving one open descriptor per file to the garbage collector.
            with open(file_path, 'r') as text_file:
                text = text_file.read()
        except FileNotFoundError:
            # First missing index marks the end of the numbered sequence.
            break

        # Built outside the try block so that a FileNotFoundError raised
        # while constructing the chunk is not mistaken for "no more files".
        data.append(TextChunk(text))

    return data


# Load every numbered problem file found in the raw training directory.
train = get_data('../data/train_raw')

Get POS-tag trigram/fourgram counters for each text

In [3]:
# One counter per training text, kept in the same order as `train`.
trigram_counters = [chunk.all_trigrams for chunk in train]
fourgram_counters = [chunk.all_fourgrams for chunk in train]

Combine all counters to count the total occurrences of each trigram/fourgram

In [6]:
# Merge the per-text counters into corpus-wide totals.
# Starting from an empty Counter keeps this working when there are no texts
# (reduce without an initial value raises TypeError on an empty list), and
# updating in place avoids building a new Counter at every step.
trigram_counts = Counter()
for text_trigrams in trigram_counters:
    trigram_counts.update(text_trigrams)

fourgram_counts = Counter()
for text_fourgrams in fourgram_counters:
    fourgram_counts.update(text_fourgrams)

Get 100 most common trigrams and 100 most common fourgrams

In [8]:
# most_common(100) yields (ngram, count) pairs; only the n-gram is kept.
most_common_trigrams = [ngram for ngram, _count in trigram_counts.most_common(100)]

most_common_fourgrams = [ngram for ngram, _count in fourgram_counts.most_common(100)]

Write the most common POS-tag trigrams/fourgrams to files

In [20]:
# Files handed to csv.writer must be opened with newline='' so the writer
# controls row endings itself; without it, platforms that translate '\n'
# to '\r\n' end up with an extra '\r' on every row.
with open('../data/pos_tag_ngrams/most_common_pos_tag_trigrams.csv', 'w', newline='') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)

    # One row per trigram, one quoted tag per column.
    wr.writerows(most_common_trigrams)

with open('../data/pos_tag_ngrams/most_common_pos_tag_fourgrams.csv', 'w', newline='') as myfile:
    wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)

    # One row per fourgram, one quoted tag per column.
    wr.writerows(most_common_fourgrams)

Check files:

In [21]:
# Read both files back to check what was written. newline='' lets the csv
# module do its own newline handling, as its documentation requires.
with open('../data/pos_tag_ngrams/most_common_pos_tag_trigrams.csv', 'r', newline='') as f:
    # Each CSV row becomes one tuple of tags.
    POS_TAG_TRIGRAMS = [tuple(row) for row in csv.reader(f)]

with open('../data/pos_tag_ngrams/most_common_pos_tag_fourgrams.csv', 'r', newline='') as f:
    POS_TAG_FOURGRAMS = [tuple(row) for row in csv.reader(f)]

In [22]:
# Display the trigram tuples read back from the CSV file.
POS_TAG_TRIGRAMS

[('IN', 'DT', 'NN'),
 ('DT', 'NN', 'IN'),
 ('DT', 'JJ', 'NN'),
 ('NN', 'IN', 'DT'),
 ('IN', 'DT', 'JJ'),
 ('JJ', 'NN', 'IN'),
 ('NN', 'IN', 'NN'),
 ('DT', 'NN', 'NN'),
 ('VB', 'DT', 'NN'),
 ('PRP', 'MD', 'VB'),
 ('DT', 'NN', '__END__'),
 ('IN', 'DT', 'NNP'),
 ('NNP', 'NNP', 'NNP'),
 ('DT', 'NN', 'VBZ'),
 ('TO', 'VB', 'DT'),
 ('JJ', 'NN', '__END__'),
 ('NNS', 'IN', 'DT'),
 ('NN', 'TO', 'VB'),
 ('IN', 'PRP', 'VBP'),
 ('VBZ', 'DT', 'NN'),
 ('IN', 'DT', 'NNS'),
 ('NN', 'IN', 'NNP'),
 ('JJ', 'NNS', 'IN'),
 ('VBN', 'IN', 'DT'),
 ('JJ', 'NN', 'NN'),
 ('IN', 'JJ', 'NNS'),
 ('IN', 'NN', 'IN'),
 ('NN', 'IN', 'JJ'),
 ('DT', 'NN', 'CC'),
 ('__START__', 'DT', 'NN'),
 ('IN', 'PRP$', 'NN'),
 ('IN', 'JJ', 'NN'),
 ('NN', 'NN', '__END__'),
 ('DT', 'NNP', 'NNP'),
 ('TO', 'VB', 'IN'),
 ('NN', 'MD', 'VB'),
 ('NN', 'NN', 'IN'),
 ('IN', 'NNP', 'NNP'),
 ('NN', 'IN', 'PRP'),
 ('__START__', 'IN', 'DT'),
 ('NN', 'IN', 'NNS'),
 ('DT', 'NNS', 'IN'),
 ('MD', 'RB', 'VB'),
 ('NN', 'CC', 'NN'),
 ('RB', 'IN', 'DT'),
 (

In [23]:
# Display the fourgram tuples read back from the CSV file.
POS_TAG_FOURGRAMS

[('NN', 'IN', 'DT', 'NN'),
 ('IN', 'DT', 'NN', 'IN'),
 ('IN', 'DT', 'JJ', 'NN'),
 ('DT', 'NN', 'IN', 'DT'),
 ('DT', 'JJ', 'NN', 'IN'),
 ('DT', 'NN', 'IN', 'NN'),
 ('JJ', 'NN', 'IN', 'DT'),
 ('NN', 'IN', 'DT', 'JJ'),
 ('IN', 'DT', 'NN', '__END__'),
 ('IN', 'DT', 'NN', 'NN'),
 ('TO', 'VB', 'DT', 'NN'),
 ('DT', 'JJ', 'NN', '__END__'),
 ('VB', 'DT', 'NN', 'IN'),
 ('NNS', 'IN', 'DT', 'NN'),
 ('NNP', 'NNP', 'NNP', 'NNP'),
 ('VBZ', 'DT', 'JJ', 'NN'),
 ('VBZ', 'DT', 'NN', 'IN'),
 ('VB', 'DT', 'JJ', 'NN'),
 ('VBN', 'IN', 'DT', 'NN'),
 ('IN', 'DT', 'NN', 'VBZ'),
 ('DT', 'JJ', 'NN', 'NN'),
 ('NN', 'IN', 'DT', 'NNP'),
 ('DT', 'NN', 'IN', 'NNP'),
 ('IN', 'DT', 'NNP', 'NNP'),
 ('IN', 'DT', 'NN', 'CC'),
 ('DT', 'NN', 'IN', 'JJ'),
 ('JJ', 'NN', 'IN', 'NN'),
 ('DT', 'NN', 'TO', 'VB'),
 ('DT', 'NN', 'IN', 'NNS'),
 ('__START__', 'IN', 'DT', 'NN'),
 ('NN', 'IN', 'JJ', 'NN'),
 ('NN', 'IN', 'NN', 'IN'),
 ('RB', 'IN', 'DT', 'NN'),
 ('IN', 'NN', 'IN', 'DT'),
 ('MD', 'VB', 'DT', 'NN'),
 ('DT', 'JJ', 'NN', 'VBZ