In [None]:
import re
import nltk
from nltk.tokenize import PunktSentenceTokenizer

In [None]:
def check_divisible(numb):
  '''
  Pick a chunk size for numb items.

  The candidates 4, 5, 6, 7, 8, 3, 2, 9, 10 are tried in exactly this order
  and the first one dividing numb without remainder is returned.
  If no candidate divides numb, 5 is returned as fallback.
  '''
  # Order matters: the first hit wins. 9 and 10 can never be selected,
  # because any multiple of them is already caught by 3 or 5.
  for candidate in (4, 5, 6, 7, 8, 3, 2, 9, 10):
    if numb % candidate == 0:
      return candidate
  return 5

In [None]:
def chunker(seq, size):
  '''
  Lazily cut seq into consecutive slices of length size.

  The last slice holds whatever is left over and may be shorter.
  Returns a generator over the slices.
  '''
  # The start offsets are computed right away; slicing happens on iteration.
  offsets = range(0, len(seq), size)
  return (seq[start:start + size] for start in offsets)

In [None]:
def paragraphs_to_sequences(subset, sent_tokenizer):
  '''
  Turn a list of paragraphs into a flat list of single-line sequences.

  Per paragraph:
  - 2 characters or fewer: skipped
  - at most 10 lines: kept whole as one sequence
  - more than 10 lines: split into sentences with sent_tokenizer.tokenize, then
      - up to 6 sentences: one sequence
      - 7 to 20 sentences: two sequences, cut at the midpoint
      - more than 20 sentences: chunks of check_divisible(n) sentences
        (the last chunk may be shorter)

  Newlines left inside a sequence are replaced by spaces.
  Returns a list of sequences
  '''
  collected = []

  for text in subset:
    # Near-empty paragraphs carry no content.
    if len(text) <= 2:
      continue

    # Short paragraphs are kept as they are, without sentence splitting.
    if len(text.split("\n")) <= 10:
      collected.append(text)
      continue

    sentences = sent_tokenizer.tokenize(text)
    count = len(sentences)

    if count <= 6:
      collected.append(" ".join(sentences))
    elif count <= 20:
      midpoint = count // 2
      collected.append(" ".join(sentences[:midpoint]))
      collected.append(" ".join(sentences[midpoint:]))
    else:
      chunk_size = check_divisible(count)
      collected.extend(" ".join(group) for group in chunker(sentences, chunk_size))

  # Every sequence has to fit on a single output line.
  return [seq.replace("\n", " ") for seq in collected]

In [None]:
def end_punct(dataset):
  '''
  Normalise the end punctuation of every line in dataset.

  For lines longer than 3 characters:
  - a leading '- ' is removed (only at the start of the line)
  - a space in front of a final '.', '?' or '!' is removed
  - surrounding whitespace is stripped
  - a '.' is appended if the line does not already end in punctuation

  Lines of 3 characters or fewer are passed through unchanged.
  Returns a new list with one entry per input line.
  '''
  # Characters that already close a line; no '.' is added after them.
  closing_chars = ('!', '?', '.', ',', ':', ';', '\'', '\"', '-', ')')

  prep = []
  for line in dataset:
    if len(line) > 3:
      if line.startswith("- "):
        # Drop only the leading marker; dashes inside the line are content.
        line = line[2:]
      if len(line) > 3:
        # "word ." -> "word." (the matched punctuation mark itself is kept)
        line = re.sub(r" ([.?!])$", r"\1", line)
      line = line.strip()
      if len(line) > 3 and not line.endswith(closing_chars):
        line += "."
    prep.append(line)

  return prep

In [None]:
def wiki_prep(dataset):
  '''
  Simple Wiki Preprocessing: takes list of lines and returns preprocessed list of lines
  Heading lines (those beginning with "=" or " =") are dropped;
  tab characters are deleted from all remaining lines.
  '''
  heading_prefixes = ("=", " =")
  return [
    line.replace("\t", "")
    for line in dataset
    if not line.startswith(heading_prefixes)
  ]

In [None]:
def switch_prep(dataset):
  '''
  Switchboard Preprocessing: takes list of lines and returns preprocessed list of lines
  A line that opens with a speaker marker ("A:" or "B:" followed by a tab)
  has every occurrence of that same marker deleted; all other lines are
  kept as they are.
  '''
  # "B" is checked before "A"; only the marker found at the line start is removed.
  speaker_markers = ("B:\t", "A:\t")

  cleaned = []
  for line in dataset:
    for marker in speaker_markers:
      if line.startswith(marker):
        line = line.replace(marker, "")
        break
    cleaned.append(line)

  return cleaned

In [None]:

def split_sents(dataset):
  '''
  Group the lines of dataset into sequences of 5.

  dataset is a single string; it is split on newlines and every run of
  5 consecutive lines is joined with single spaces. The last sequence
  holds the remaining lines and may be shorter.
  Returns the list of sequences.
  '''
  group_size = 5
  lines = dataset.split("\n")
  return [
    " ".join(lines[start:start + group_size])
    for start in range(0, len(lines), group_size)
  ]



In [None]:
def prep_childes(dataset):
  '''
  Childes Preprocessing: takes list of lines and returns preprocessed list of lines
  - removes the speaker markers *CHI:, *MOT: and *COL: (each followed by a tab)
  - upper-cases the first letter of lines that start with a lowercase letter,
    leaving the rest of the line untouched
  - normalises end punctuation via end_punct
  - drops lines starting with "=" or " =" and removes tabs via wiki_prep
  '''
  speaker_markers = ("*CHI:\t", "*MOT:\t", "*COL:\t")

  prep = []
  for line in dataset:
    for marker in speaker_markers:
      line = line.replace(marker, "")
    if len(line) >= 3:
      if line[0].isalpha() and line[0].islower():
        # str.capitalize() would also lowercase the remainder of the line
        # (names, "I", acronyms), so only the first character is changed.
        line = line[0].upper() + line[1:]
    prep.append(line)

  prepped = end_punct(prep)
  prepped2 = wiki_prep(prepped)

  return prepped2

In [None]:
def prep_gutenberg(dataset):
  '''
  Gutenberg Preprocessing: takes list of lines and returns preprocessed list of lines
  - section-break lines ("*      *...") and chapter headings ("CHAPTER...",
    "*CHAPTER...") are replaced by a single newline character
  - the formatting characters "=" and "_" are replaced by spaces
  '''
  break_prefixes = ("*      *", "CHAPTER", "*CHAPTER")

  prep = []
  for line in dataset:
    if line.startswith(break_prefixes):
      line = "\n"
    # str.replace returns a new string, so the result must be assigned back.
    line = line.replace("=", " ").replace("_", " ")
    prep.append(line)
  return prep

### Training set

In [None]:
# BNC_SPOKEN preprocessing
# Reads the raw lines, normalises end punctuation and writes the result.
# Files are opened as UTF-8 explicitly (assumed corpus encoding) instead of
# relying on the platform default.

with open('data/train/bnc_spoken.train', encoding="utf-8") as infile:
  bnc = infile.read().split("\n")

prep_bnc = end_punct(bnc)

with open('data/train/prep_bnc_spoken.train', "w", encoding="utf-8") as outfile:
  outfile.write("\n".join(prep_bnc))

In [None]:
# BNC_SPOKEN splitting
# Joins every 5 preprocessed lines into one sequence (one sequence per output line).
# Files are opened as UTF-8 explicitly (assumed corpus encoding) instead of
# relying on the platform default.

with open('data/train/prep_bnc_spoken.train', encoding="utf-8") as infile:
  bnc = infile.read()

sequences = split_sents(bnc)

with open('data/train/seq_bnc_spoken.train', "w", encoding="utf-8") as outfile:
  outfile.write("\n".join(sequences))

In [None]:
# OPEN_SUBTITLES preprocessing
# Reads the raw lines, normalises end punctuation and writes the result.
# Files are opened as UTF-8 explicitly (assumed corpus encoding) instead of
# relying on the platform default.

with open('data/train/open_subtitles.train', encoding="utf-8") as infile:
  opensubs = infile.read().split("\n")

prep_subs = end_punct(opensubs)

with open('data/train/prep_open_subtitles.train', "w", encoding="utf-8") as outfile:
  outfile.write("\n".join(prep_subs))

In [None]:
# OPEN_SUBTITLES splitting
# Joins every 5 preprocessed lines into one sequence (one sequence per output line).
# Files are opened as UTF-8 explicitly (assumed corpus encoding) instead of
# relying on the platform default.

with open('data/train/prep_open_subtitles.train', encoding="utf-8") as infile:
  subs = infile.read()

sequences = split_sents(subs)

with open('data/train/seq_open_subtitles.train', "w", encoding="utf-8") as outfile:
  outfile.write("\n".join(sequences))

In [None]:
# SIMPLE_WIKI preprocessing
# Reads the raw lines, drops heading lines, removes tabs and writes the result.
# Files are opened as UTF-8 explicitly (assumed corpus encoding) instead of
# relying on the platform default.

with open('data/train/simple_wiki.train', encoding="utf-8") as infile:
  wiki = infile.read().split("\n")

prep_wiki = wiki_prep(wiki)

with open('data/train/prep_simple_wiki.train', "w", encoding="utf-8") as outfile:
  outfile.write("\n".join(prep_wiki))

In [None]:
# SIMPLE_WIKI splitting
# Paragraphs (separated by blank lines) are turned into sequences.
# Files are opened as UTF-8 explicitly (assumed corpus encoding) instead of
# relying on the platform default.

with open('data/train/prep_simple_wiki.train', encoding="utf-8") as infile:
  full_wikipedia = infile.read()
  wikipedia = full_wikipedia.split("\n\n")

# The sentence tokenizer is built from the complete preprocessed text.
wiki_tokenizer = PunktSentenceTokenizer(full_wikipedia)

sequences = paragraphs_to_sequences(wikipedia, wiki_tokenizer)

with open('data/train/seq_simple_wiki.train', 'w', encoding="utf-8") as outfile:
  outfile.write("\n".join(sequences))

In [None]:
# SWITCHBOARD preprocessing
# Reads the raw lines, removes the speaker markers and writes the result.
# Files are opened as UTF-8 explicitly (assumed corpus encoding) instead of
# relying on the platform default.

with open('data/train/switchboard.train', encoding="utf-8") as infile:
  switch = infile.read().split("\n")

prep_switch = switch_prep(switch)

with open('data/train/prep_switchboard.train', "w", encoding="utf-8") as outfile:
  outfile.write("\n".join(prep_switch))

In [None]:
# SWITCHBOARD splitting
# Joins every 5 preprocessed lines into one sequence (one sequence per output line).
# Files are opened as UTF-8 explicitly (assumed corpus encoding) instead of
# relying on the platform default.

with open('data/train/prep_switchboard.train', encoding="utf-8") as infile:
  switch = infile.read()

sequences = split_sents(switch)

with open('data/train/seq_switchboard.train', "w", encoding="utf-8") as outfile:
  outfile.write("\n".join(sequences))

In [None]:
# CHILDES preprocessing
# Reads the raw lines, applies prep_childes and writes the result.
# Files are opened as UTF-8 explicitly (assumed corpus encoding) instead of
# relying on the platform default.

with open('data/train/childes.train', encoding="utf-8") as infile:
  childes = infile.read().split("\n")

prepped = prep_childes(childes)

with open('data/train/prep_childes.train', 'w', encoding="utf-8") as outfile:
  outfile.write("\n".join(prepped))

In [None]:
# CHILDES splitting
# Blocks separated by blank lines (treated as conversations) are chunked
# individually, so no sequence spans two blocks.
# Files are opened as UTF-8 explicitly (assumed corpus encoding) instead of
# relying on the platform default.

with open('data/train/prep_childes.train', encoding="utf-8") as infile:
  childes = infile.read().split("\n\n")

sequences = []
for convo in childes:
  seqs = split_sents(convo)
  sequences.extend(seqs)

with open('data/train/seq_childes.train', 'w', encoding="utf-8") as outfile:
  outfile.write("\n".join(sequences))

In [None]:
# GUTENBERG preprocessing
# Reads the raw lines, applies prep_gutenberg and writes the result.
# Files are opened as UTF-8 explicitly (assumed corpus encoding) instead of
# relying on the platform default.

with open('data/train/gutenberg.train', encoding="utf-8") as infile:
  gutenberg = infile.read().split("\n")

prep = prep_gutenberg(gutenberg)

with open('data/train/prep_gutenberg.train', 'w', encoding="utf-8") as outfile:
  outfile.write("\n".join(prep))

In [None]:
# GUTENBERG splitting
# Files are opened as UTF-8 explicitly (assumed corpus encoding) instead of
# relying on the platform default.

with open('data/train/prep_gutenberg.train', encoding="utf-8") as infile:
  full_gutenberg = infile.read()

# Split on two blank lines first, then split each part on single blank lines.
gut = full_gutenberg.split("\n\n\n")
gutenberg = []
for x in gut:
  gutenberg.extend(x.split("\n\n"))

# The sentence tokenizer is built from the complete preprocessed text.
gut_tokenizer = PunktSentenceTokenizer(full_gutenberg)

sequences = paragraphs_to_sequences(gutenberg, gut_tokenizer)

with open('data/train/seq_gutenberg.train', 'w', encoding="utf-8") as outfile:
  outfile.write("\n".join(sequences))

In [None]:
# combine all preprocessed+split datasets into one file
# The sequence files are concatenated in the order listed in all_texts.
# Files are opened as UTF-8 explicitly (assumed corpus encoding) instead of
# relying on the platform default.

combined = []

all_texts = [
  'data/train/seq_switchboard.train',
  'data/train/seq_bnc_spoken.train',
  'data/train/seq_childes.train',
  'data/train/seq_gutenberg.train',
  'data/train/seq_open_subtitles.train',
  'data/train/seq_simple_wiki.train',
]

for text in all_texts:
  with open(text, encoding="utf-8") as infile:
    sequences = infile.read().split("\n")
    combined.extend(sequences)

with open('data/train/combined.train', 'w', encoding="utf-8") as outfile:
  outfile.write("\n".join(combined))

### Dev set

In [None]:
# BNC_SPOKEN preprocessing
# Reads the raw lines, normalises end punctuation and writes the result.
# Files are opened as UTF-8 explicitly (assumed corpus encoding) instead of
# relying on the platform default.

with open('data/dev/bnc_spoken.dev', encoding="utf-8") as infile:
  bnc = infile.read().split("\n")

prep_bnc = end_punct(bnc)

with open('data/dev/prep_bnc_spoken.dev', "w", encoding="utf-8") as outfile:
  outfile.write("\n".join(prep_bnc))

In [None]:
# BNC_SPOKEN splitting
# Joins every 5 preprocessed lines into one sequence (one sequence per output line).
# Files are opened as UTF-8 explicitly (assumed corpus encoding) instead of
# relying on the platform default.

with open('data/dev/prep_bnc_spoken.dev', encoding="utf-8") as infile:
  bnc = infile.read()

sequences = split_sents(bnc)

with open('data/dev/seq_bnc_spoken.dev', "w", encoding="utf-8") as outfile:
  outfile.write("\n".join(sequences))

In [None]:
# OPEN_SUBTITLES preprocessing
# Reads the raw lines, normalises end punctuation and writes the result.
# Files are opened as UTF-8 explicitly (assumed corpus encoding) instead of
# relying on the platform default.

with open('data/dev/open_subtitles.dev', encoding="utf-8") as infile:
  opensubs = infile.read().split("\n")

prep_subs = end_punct(opensubs)

with open('data/dev/prep_open_subtitles.dev', "w", encoding="utf-8") as outfile:
  outfile.write("\n".join(prep_subs))

In [None]:
# OPEN_SUBTITLES splitting
# Joins every 5 preprocessed lines into one sequence (one sequence per output line).
# Files are opened as UTF-8 explicitly (assumed corpus encoding) instead of
# relying on the platform default.

with open('data/dev/prep_open_subtitles.dev', encoding="utf-8") as infile:
  subs = infile.read()

sequences = split_sents(subs)

with open('data/dev/seq_open_subtitles.dev', "w", encoding="utf-8") as outfile:
  outfile.write("\n".join(sequences))

In [None]:
# SIMPLE_WIKI preprocessing
# Reads the raw lines, drops heading lines, removes tabs and writes the result.
# Files are opened as UTF-8 explicitly (assumed corpus encoding) instead of
# relying on the platform default.

with open('data/dev/simple_wiki.dev', encoding="utf-8") as infile:
  wiki = infile.read().split("\n")

prep_wiki = wiki_prep(wiki)

with open('data/dev/prep_simple_wiki.dev', "w", encoding="utf-8") as outfile:
  outfile.write("\n".join(prep_wiki))

In [None]:
# SIMPLE_WIKI splitting
# Paragraphs (separated by blank lines) are turned into sequences.
# Files are opened as UTF-8 explicitly (assumed corpus encoding) instead of
# relying on the platform default.

with open('data/dev/prep_simple_wiki.dev', encoding="utf-8") as infile:
  full_wikipedia = infile.read()
  wikipedia = full_wikipedia.split("\n\n")

# The sentence tokenizer is built from the complete preprocessed text.
wiki_tokenizer = PunktSentenceTokenizer(full_wikipedia)

sequences = paragraphs_to_sequences(wikipedia, wiki_tokenizer)

with open('data/dev/seq_simple_wiki.dev', 'w', encoding="utf-8") as outfile:
  outfile.write("\n".join(sequences))

In [None]:
# SWITCHBOARD preprocessing
# Reads the raw lines, removes the speaker markers and writes the result.
# Files are opened as UTF-8 explicitly (assumed corpus encoding) instead of
# relying on the platform default.

with open('data/dev/switchboard.dev', encoding="utf-8") as infile:
  switch = infile.read().split("\n")

prep_switch = switch_prep(switch)

with open('data/dev/prep_switchboard.dev', "w", encoding="utf-8") as outfile:
  outfile.write("\n".join(prep_switch))

In [None]:
# SWITCHBOARD splitting
# Joins every 5 preprocessed lines into one sequence (one sequence per output line).
# Files are opened as UTF-8 explicitly (assumed corpus encoding) instead of
# relying on the platform default.

with open('data/dev/prep_switchboard.dev', encoding="utf-8") as infile:
  switch = infile.read()

sequences = split_sents(switch)

with open('data/dev/seq_switchboard.dev', "w", encoding="utf-8") as outfile:
  outfile.write("\n".join(sequences))

In [None]:
# CHILDES preprocessing
# Reads the raw lines, applies prep_childes and writes the result.
# Files are opened as UTF-8 explicitly (assumed corpus encoding) instead of
# relying on the platform default.

with open('data/dev/childes.dev', encoding="utf-8") as infile:
  childes = infile.read().split("\n")

prepped = prep_childes(childes)

with open('data/dev/prep_childes.dev', 'w', encoding="utf-8") as outfile:
  outfile.write("\n".join(prepped))

In [None]:
# CHILDES splitting
# Blocks separated by blank lines (treated as conversations) are chunked
# individually, so no sequence spans two blocks.
# Files are opened as UTF-8 explicitly (assumed corpus encoding) instead of
# relying on the platform default.

with open('data/dev/prep_childes.dev', encoding="utf-8") as infile:
  childes = infile.read().split("\n\n")

sequences = []
for convo in childes:
  seqs = split_sents(convo)
  sequences.extend(seqs)

with open('data/dev/seq_childes.dev', 'w', encoding="utf-8") as outfile:
  outfile.write("\n".join(sequences))

In [None]:
# GUTENBERG preprocessing
# Reads the raw lines, applies prep_gutenberg and writes the result.
# Files are opened as UTF-8 explicitly (assumed corpus encoding) instead of
# relying on the platform default.

with open('data/dev/gutenberg.dev', encoding="utf-8") as infile:
  gutenberg = infile.read().split("\n")

prep = prep_gutenberg(gutenberg)

with open('data/dev/prep_gutenberg.dev', 'w', encoding="utf-8") as outfile:
  outfile.write("\n".join(prep))

In [None]:
# GUTENBERG splitting
# Files are opened as UTF-8 explicitly (assumed corpus encoding) instead of
# relying on the platform default.

with open('data/dev/prep_gutenberg.dev', encoding="utf-8") as infile:
  full_gutenberg = infile.read()

# Split on two blank lines first, then split each part on single blank lines.
gut = full_gutenberg.split("\n\n\n")
gutenberg = []
for x in gut:
  gutenberg.extend(x.split("\n\n"))

# The sentence tokenizer is built from the complete preprocessed text.
gut_tokenizer = PunktSentenceTokenizer(full_gutenberg)

sequences = paragraphs_to_sequences(gutenberg, gut_tokenizer)

with open('data/dev/seq_gutenberg.dev', 'w', encoding="utf-8") as outfile:
  outfile.write("\n".join(sequences))

In [None]:
# combine all preprocessed+split datasets into one file
# The sequence files are concatenated in the order listed in all_texts.
# Files are opened as UTF-8 explicitly (assumed corpus encoding) instead of
# relying on the platform default.

combined = []

all_texts = [
  'data/dev/seq_switchboard.dev',
  'data/dev/seq_bnc_spoken.dev',
  'data/dev/seq_childes.dev',
  'data/dev/seq_gutenberg.dev',
  'data/dev/seq_open_subtitles.dev',
  'data/dev/seq_simple_wiki.dev',
]

for text in all_texts:
  with open(text, encoding="utf-8") as infile:
    sequences = infile.read().split("\n")
    combined.extend(sequences)

with open('data/dev/combined.dev', 'w', encoding="utf-8") as outfile:
  outfile.write("\n".join(combined))