In [1]:
import pandas as pd
import numpy as np

In [2]:
raw_dataset = pd.read_csv('ner_dataset2.csv')
raw_dataset['Sentence Start'] = ~raw_dataset['Sentence #'].isna()
raw_dataset

Unnamed: 0,Sentence #,Word,POS,Tag,Sentence Start
0,Sentence: 1,Thousands,NNS,O,True
1,,of,IN,O,False
2,,demonstrators,NNS,O,False
3,,have,VBP,O,False
4,,marched,VBN,O,False
...,...,...,...,...,...
1048570,,they,PRP,O,False
1048571,,responded,VBD,O,False
1048572,,to,TO,O,False
1048573,,the,DT,O,False


In [3]:
# index 0 is unigram, 1 is bigram, 2 is trigram, 3 is quadgram
# the keys of each dictionary are effectively a path to the occurence
#   for a unigram, this would be simply the current word
#   for higher n-grams, this would be the sequence of previous POS with the current word
# the values of each dictionary are also dictionaries
#   the key is the observed type
#   the value is the number of times this POS occurred
ngram_maps = [{}, {}, {}, {}]

sentences = 1

previously_seen = []
for index, row in raw_dataset.iterrows():
    # reset to having seen nothing if new sentence starts
    if row['Sentence Start']:
        
        # TODO: remove this
        if sentences <= 0:
            break 
        else:
            sentences -= 1

        previously_seen = []
    
    # unigram
    if row['Word'] not in ngram_maps[0]:
        ngram_maps[0][row['Word']] = { row['POS']: 1 }
    else:
        if row['POS'] in ngram_maps[0][row['Word']]:
            ngram_maps[0][row['Word']][row['POS']] += 1
        else:
            ngram_maps[0][row['Word']][row['POS']] = 1

    # higher n-grams
    for n in range(1, len(ngram_maps)):
        # skip higher grams if not enough previously seen words in this sentence
        if len(previously_seen) < n:
            break

        # unpacks the last n elements into a tuple with word, then check if that pattern has been observed yet
        if (*previously_seen[-n:len(previously_seen)], row['Word']) not in ngram_maps:
            ngram_maps[n][(*previously_seen[-n:len(previously_seen)], row['Word'])] = { row['POS']: 1 }
        else:
            if row['POS'] in ngram_maps[n][(*previously_seen[-n:len(previously_seen)], row['Word'])]:
                ngram_maps[n][(*previously_seen[-n:len(previously_seen)], row['Word'])][row['POS']] += 1
            else:
                ngram_maps[n][(*previously_seen[-n:len(previously_seen)], row['Word'])][row['POS']] = 1

    # store the part of speech
    previously_seen.append(row['POS'])

In [4]:
ngram_maps

[{'Thousands': {'NNS': 1},
  'of': {'IN': 2},
  'demonstrators': {'NNS': 1},
  'have': {'VBP': 1},
  'marched': {'VBN': 1},
  'through': {'IN': 1},
  'London': {'NNP': 1},
  'to': {'TO': 1},
  'protest': {'VB': 1},
  'the': {'DT': 2},
  'war': {'NN': 1},
  'in': {'IN': 1},
  'Iraq': {'NNP': 1},
  'and': {'CC': 1},
  'demand': {'VB': 1},
  'withdrawal': {'NN': 1},
  'British': {'JJ': 1},
  'troops': {'NNS': 1},
  'from': {'IN': 1},
  'that': {'DT': 1},
  'country': {'NN': 1},
  '.': {'.': 1}},
 {('NNS', 'of'): {'IN': 1},
  ('IN', 'demonstrators'): {'NNS': 1},
  ('NNS', 'have'): {'VBP': 1},
  ('VBP', 'marched'): {'VBN': 1},
  ('VBN', 'through'): {'IN': 1},
  ('IN', 'London'): {'NNP': 1},
  ('NNP', 'to'): {'TO': 1},
  ('TO', 'protest'): {'VB': 1},
  ('VB', 'the'): {'DT': 1},
  ('DT', 'war'): {'NN': 1},
  ('NN', 'in'): {'IN': 1},
  ('IN', 'Iraq'): {'NNP': 1},
  ('NNP', 'and'): {'CC': 1},
  ('CC', 'demand'): {'VB': 1},
  ('DT', 'withdrawal'): {'NN': 1},
  ('NN', 'of'): {'IN': 1},
  ('IN', '

In [5]:
previously_seen

['NNS',
 'IN',
 'NNS',
 'VBP',
 'VBN',
 'IN',
 'NNP',
 'TO',
 'VB',
 'DT',
 'NN',
 'IN',
 'NNP',
 'CC',
 'VB',
 'DT',
 'NN',
 'IN',
 'JJ',
 'NNS',
 'IN',
 'DT',
 'NN',
 '.']

In [None]:
import json

# TODO: need to replace tuple keys in order to do this for output
with open('models/sentence_1.json', 'w') as json_file:
    json.dump(ngram_maps, json_file, indent=4)