In [None]:
import os
import re

import nltk
from nltk.tag import pos_tag
from nltk.tokenize import PunktSentenceTokenizer
from nltk.tokenize import word_tokenize

# word_tokenize is based on TreebankWordTokenizer, which uses regular expressions (https://www.nltk.org/_modules/nltk/tokenize.html#word_tokenize , https://www.nltk.org/_modules/nltk/tokenize/treebank.html#TreebankWordTokenizer)
# PunktSentenceTokenizer uses an unsupervised algorithm (https://www.nltk.org/api/nltk.tokenize.PunktSentenceTokenizer.html)

Prepares preprocessed sequences to be POS-tagged with anchor words and cleans them up afterwards; the actual POS tagging is run in WSL (Linux).


In [None]:
# read the complete dev file; the raw text is kept in `combined`
with open('combined.dev', mode='r') as dev_file:
  combined = dev_file.read()

# one entry per newline-separated line of the file
comb_sequences = combined.split("\n")

In [None]:
# fit the Punkt sentence tokenizer on the raw dev text
sent_tokenizer = PunktSentenceTokenizer(train_text=combined)

In [None]:
# tokenize and POS-tag the dev set:
# one output line per sentence, formatted as "tokens<TAB>tags"

tagged_lines = []

for seq in comb_sequences:
  for sentence in sent_tokenizer.tokenize(seq):
    tagged = pos_tag(word_tokenize(sentence), tagset='universal')
    # every token / tag is followed by one space, including the last one
    token_column = ''.join(word + ' ' for word, _ in tagged)
    tag_column = ''.join(tag + ' ' for _, tag in tagged)
    tagged_lines.append(token_column + "\t" + tag_column + "\n")

full_string = ''.join(tagged_lines)

In [None]:
# store the tagged dev sentences
with open('tagged_combined.dev', mode='w') as dev_out:
  dev_out.write(full_string)

In [None]:
# read the complete train file; the raw text is kept in `combined`
with open('combined.train', mode='r') as train_file:
  combined = train_file.read()

# one entry per newline-separated line of the file
comb_sequences = combined.split("\n")

In [None]:
# fit the Punkt sentence tokenizer on the raw train text
sent_tokenizer = PunktSentenceTokenizer(train_text=combined)

In [None]:
# tokenize preprocessed sequences: one space-joined sentence per output line

tokenized_lines = []

for seq in comb_sequences:
  for sentence in sent_tokenizer.tokenize(seq):
    tokenized_lines.append(" ".join(word_tokenize(sentence)) + "\n")

full_string = "".join(tokenized_lines)

with open('tokenized_combined.train', mode='w') as train_out:
  train_out.write(full_string)

In [None]:
# prepare tokenized sequences to be POS-tagged:
# every token gets the suffix "__<label>__N" appended, one sequence per line

with open('tokenized_combined.train') as infile:
  combined = infile.read().split("\n")

anchored_lines = []
for seq in combined:
  # split("\n") leaves an empty entry after the file's final newline, and
  # split(" ") yields '' for blank lines or doubled spaces; dropping empty
  # tokens avoids writing a word-less "__<label>__N" item
  tokens = [token for token in seq.split(" ") if token]
  if not tokens:
    continue
  anchored_lines.append("".join(token + "__<label>__N " for token in tokens) + "\n")

full_text = "".join(anchored_lines)

with open('tagged_combined.train', 'w') as outfile:
  outfile.write(full_text)

In [None]:
# hand-written lookup: anchor word -> POS tag

tag_to_pos = {
  "let": "VERB",
  "MCH": "X",
  "are": "VERB",
  "any": "DET",
  "we": "PRON",
  "?": ".",
  "in": "ADP",
  "anything": "NOUN",
  "used": "VERB",
  "*": ".",
  "house": "NOUN",
  "A": "DET",
}

In [None]:
def cleanup(filepath, tag_map=None):
  '''
  Take POS-tagged sequences, replace anchor words with POS tags, save to file

  The input file is read as blocks separated by a blank line, one token per
  line. Only lines with exactly four space-separated fields are used: the
  first field is kept as the word and the fourth field (the anchor word) is
  translated to its POS tag. All other lines are ignored.

  The result is written next to the input file, with 'clean_' prefixed to
  the file name: one line per sequence, formatted as "words<TAB>tags".

  filepath: path of the POS-tagged input file
  tag_map:  optional mapping anchor word -> POS tag; defaults to the
            module-level tag_to_pos

  Raises KeyError if an anchor word is missing from the mapping.
  '''
  if tag_map is None:
    tag_map = tag_to_pos

  with open(filepath) as infile:
    sequences = infile.read().split("\n\n")

  cleaned_sequences = []
  for seq in sequences:
    words = []
    pos = []
    for line in seq.split("\n"):
      fields = line.split(" ")
      if len(fields) == 4:
        words.append(fields[0])
        pos.append(tag_map[fields[3]])
    # blocks without any token line (e.g. the empty tail after the file's
    # final blank line) would otherwise be written as a bare "\t" line
    if words:
      cleaned_sequences.append(" ".join(words) + "\t" + " ".join(pos))

  # prefix the file name only, so a path with a directory part stays valid
  directory, filename = os.path.split(filepath)
  new_filepath = os.path.join(directory, 'clean_' + filename)
  with open(new_filepath, 'w') as outfile:
    outfile.write("\n".join(cleaned_sequences))

In [None]:
# clean up the POS-tagged sequences stored in "pred3"
cleanup(filepath="pred3")