In [46]:
import pandas as pd
import numpy as np

In [47]:
# Load the token-level table: one row per word with its POS and NER tag.
raw_dataset = pd.read_csv('ner_dataset2.csv')
# A row opens a sentence exactly when its 'Sentence #' cell is populated.
raw_dataset['Sentence Start'] = raw_dataset['Sentence #'].notna()
raw_dataset

Unnamed: 0,Sentence #,Word,POS,Tag,Sentence Start
0,Sentence: 1,Thousands,NNS,O,True
1,,of,IN,O,False
2,,demonstrators,NNS,O,False
3,,have,VBP,O,False
4,,marched,VBN,O,False
...,...,...,...,...,...
1048570,,they,PRP,O,False
1048571,,responded,VBD,O,False
1048572,,to,TO,O,False
1048573,,the,DT,O,False


In [48]:
# TODO: decide on start and end delimiters
# Markers used to flatten an n-gram tuple into a single string, so that it can
# serve as a JSON object key (JSON keys must be strings).
tuple_start_delimiter = '<START>'
tuple_sep_delimiter = '<SEP>'
tuple_end_delimiter = '<END>'

def create_tuple_string_for_json(tup):
    """Flatten a sequence of strings into one delimiter-wrapped string.

    The result is ``tuple_start_delimiter``, then the elements of ``tup``
    joined by ``tuple_sep_delimiter``, then ``tuple_end_delimiter``.
    For example ``('DT', 'NN', 'dog')`` becomes
    ``'<START>DT<SEP>NN<SEP>dog<END>'`` with the default delimiters.

    Parameters
    ----------
    tup : sequence of str
        The elements to encode. An empty sequence yields just the start and
        end delimiters.

    Returns
    -------
    str
        The encoded string.

    Raises
    ------
    TypeError
        If any element of ``tup`` is not a string.
    """
    # Use the configured separator (previously '<SEP>' was hard-coded here,
    # so editing tuple_sep_delimiter had no effect).
    return f'{tuple_start_delimiter}{tuple_sep_delimiter.join(tup)}{tuple_end_delimiter}'

In [49]:
# # test tuple function
# tup = ('(', '\'', ')')
# create_tuple_string_for_json(tup)

In [50]:
# index 0 is unigram, 1 is bigram, 2 is trigram, 3 is quadgram
# the keys of each dictionary are effectively a path to the occurrence
#   for a unigram, this is simply the current word
#   for higher n-grams, this is the sequence of previous POS tags followed by
#   the current word, encoded as one string by create_tuple_string_for_json
# the values of each dictionary are also dictionaries
#   the key is the POS tag observed for the current word
#   the value is the number of times that POS tag occurred for this key
ngram_maps = [{}, {}, {}, {}]

# POS tags of the words already processed in the current sentence
previously_seen = []

# Walk the three needed columns in lockstep. This avoids DataFrame.iterrows(),
# which constructs a Series object for every row.
for sentence_start, word, pos in zip(
    raw_dataset['Sentence Start'], raw_dataset['Word'], raw_dataset['POS']
):
    # reset to having seen nothing if a new sentence starts
    if sentence_start:
        previously_seen = []

    # unigram: count this POS tag for the bare word
    pos_counts = ngram_maps[0].setdefault(word, {})
    pos_counts[pos] = pos_counts.get(pos, 0) + 1

    # higher n-grams: only the orders for which this sentence already has
    # enough preceding POS tags (n previous tags are needed for index n)
    highest_order = min(len(previously_seen), len(ngram_maps) - 1)
    for n in range(1, highest_order + 1):
        # last n POS tags followed by the current word, flattened to a string key
        pattern = create_tuple_string_for_json((*previously_seen[-n:], word))
        pos_counts = ngram_maps[n].setdefault(pattern, {})
        pos_counts[pos] = pos_counts.get(pos, 0) + 1

    # store the part of speech as context for the following words
    previously_seen.append(pos)

In [51]:
# Display the list of count dictionaries (index 0 = unigram ... 3 = quadgram).
ngram_maps

[{'Thousands': {'NNS': 114},
  'of': {'IN': 26352, 'RP': 2},
  'demonstrators': {'NNS': 110},
  'have': {'VBP': 4872, 'VB': 613},
  'marched': {'VBN': 17, 'VBD': 48},
  'through': {'IN': 515},
  'London': {'NNP': 261},
  'to': {'TO': 23027, 'IN': 139, 'CC': 47},
  'protest': {'VB': 114, 'NN': 123},
  'the': {'DT': 52572, 'NNP': 1},
  'war': {'NN': 720},
  'in': {'IN': 26315, 'RP': 8},
  'Iraq': {'NNP': 1738},
  'and': {'CC': 19936},
  'demand': {'VB': 45, 'NN': 175},
  'withdrawal': {'NN': 154},
  'British': {'JJ': 561, 'NNP': 59, 'NNS': 16, 'NN': 1},
  'troops': {'NNS': 1195},
  'from': {'IN': 4539},
  'that': {'DT': 347, 'IN': 3898, 'WDT': 2055, 'RB': 1},
  'country': {'NN': 1925},
  '.': {'.': 47761},
  'Families': {'NNS': 6},
  'soldiers': {'NNS': 757},
  'killed': {'VBN': 1978, 'VBD': 883},
  'conflict': {'NN': 244, 'VBP': 1},
  'joined': {'VBD': 91, 'VBN': 25},
  'protesters': {'NNS': 197},
  'who': {'WP': 1919},
  'carried': {'VBD': 101, 'VBN': 121},
  'banners': {'NNS': 11},
  

In [52]:
# After the counting loop this holds the POS tags appended since the last
# sentence start, i.e. the tags of the final sentence in the dataset.
previously_seen

['JJ', 'NNS', 'VBD', 'PRP', 'VBD', 'TO', 'DT', 'NN']

In [53]:
import json
from pathlib import Path

# Persist the n-gram count tables as JSON. Higher n-gram keys are already
# flattened to strings by create_tuple_string_for_json.
output_path = Path('models/sentence_1.json')
# Create the target directory if needed; open(..., 'w') does not create
# missing parent directories and would raise FileNotFoundError.
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as json_file:
    json.dump(ngram_maps, json_file, indent=4)