# Cleaning of the Google Taskmaster datasets

In [None]:
from num2words import num2words
import json
import re
import time
import numpy as np
import random

# Helper function that removes punctuation
# Characters deleted by clean_data. Note the apostrophe is NOT in the set, so
# contractions such as "don't" survive. The backslash is written as "\\"
# because a bare "\]" is an invalid escape sequence (SyntaxWarning on 3.12+).
_PUNCT_TABLE = str.maketrans('', '', '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~')


def clean_data(s):
    """Return *s* lower-cased, with punctuation removed and whitespace collapsed.

    The string is split on whitespace, every character listed in the
    punctuation table is deleted from each token, tokens that become empty
    (e.g. a lone "-") are dropped, and the rest are joined by single spaces.
    """
    tokens = (token.translate(_PUNCT_TABLE).lower() for token in s.split())
    return " ".join(token for token in tokens if token)


# Element-wise version of clean_data for applying it to a whole list/array.
myv = np.vectorize(clean_data)

# Helper function to convert numbers to words
def cleanPM(w):
    """Spell out every run of digits found in *w* as English words.

    *w* is normally a string such as ``"1:00"`` or ``" 15 52"``; an iterable
    of strings is also accepted. Each maximal run of digits is converted with
    ``num2words`` and hyphens in the result are replaced by spaces
    (``"fifty-two"`` -> ``"fifty two"``). Everything that is not a digit is
    ignored.

    Returns a list with one entry per digit run, in order of appearance; the
    list is empty when *w* contains no digits.
    """
    # A plain string used to be iterated character by character when it had
    # no colon, which turned "15 52" into "one five five two". Extracting
    # whole digit runs yields "fifteen", "fifty two" instead.
    pieces = [w] if isinstance(w, str) else w
    spelled = []
    for piece in pieces:
        # \d only matches decimal digits, so int() below cannot fail the way
        # it could after str.isnumeric() (which accepts e.g. "²" or "½").
        for digits in re.findall(r"\d+", piece):
            spelled.append(num2words(int(digits)).replace("-", " "))
    return spelled

In [None]:
#Loading all Taskmaster datasets
# Load the two TM2019 dialog files into one list (distros_dict) and the TM2020
# food-ordering dialogs into distros. JSON is UTF-8, so decode it explicitly
# instead of relying on the platform default encoding.
with open('woz-dialogs.json', 'r', encoding='utf-8') as f:
    distros_dict = json.load(f)
with open('self-dialogs.json', 'r', encoding='utf-8') as f:
    distros_dict.extend(json.load(f))
with open('../TM-2-2020/data/food-ordering.json', 'r', encoding='utf-8') as f:
    distros = json.load(f)

# A clock time glued to its meridiem, e.g. "1:00pm", "11am", "(7:30pm)".
time_pattern = re.compile(r"(\d+(?::\d+)?)(am|pm)")
# Heuristic for tokens holding at least two digits, optionally separated by up
# to two punctuation characters (prices such as "$15.52", times such as "1:00").
number_pattern = re.compile(r"[!#$%&()+,-.\/:;<=>?@_`~']{0,2}\d+[!#$%&()+,-.\/:;<=>?@`~']{0,2}\d+[!#$%&()*+,-.\/:;<=>?@`~']{0,2}")
# Punctuation that is turned into a space before the digits are spelled out.
separator_pattern = re.compile(r'[!"#$%&()\*,-./:;<=>?@^_`{|}~]')
meridiem_words = {"am": "a m", "pm": "p m"}

# Cleaning the TM2020 dataset. Mostly things like 1:00pm into one p m and $15.52 into fifteen fifty two to remain consistent with the test data.
tm2020 = []
for distro in distros:
    for line in distro['utterances']:
        temp = []
        prev_was_number = False
        for word in line['text'].split():
            word = word.lower()
            if word == "(deleted)":
                continue
            is_number = False
            clock_times = time_pattern.findall(word)
            if clock_times:
                # Use the captured groups rather than word[:-2], so trailing
                # punctuation ("10:30pm.") cannot cut into the digits, and
                # keep the real meridiem instead of always emitting "p m".
                for digits, meridiem in clock_times:
                    temp.extend(cleanPM(digits))
                    temp.append(meridiem_words[meridiem])
            elif word.isdecimal():
                # isdecimal() (not isnumeric()) guarantees int() succeeds.
                temp.append(num2words(int(word)).replace("-", " "))
                is_number = True
            elif number_pattern.search(word):
                temp.extend(cleanPM(separator_pattern.sub(" ", word)))
                is_number = True
            elif word == "pm":
                temp.append(meridiem_words[word])
            elif word == "am" and prev_was_number:
                # Only a bare "am" that follows a number is a meridiem;
                # otherwise it is the verb ("i am ...") and must be kept.
                temp.append(meridiem_words[word])
            else:
                temp.append(word)
            prev_was_number = is_number
        tm2020.append(" ".join(temp))

In [10]:
lines = []

# A clock time glued to its meridiem, e.g. "1:00pm", "11am", "(7:30pm)".
time_pattern = re.compile(r"(\d+(?::\d+)?)(am|pm)")
# Heuristic for tokens holding at least two digits, optionally separated by up
# to two punctuation characters (prices such as "$15.52", times such as "1:00").
number_pattern = re.compile(r"[!#$%&()+,-.\/:;<=>?@_`~']{0,2}\d+[!#$%&()+,-.\/:;<=>?@`~']{0,2}\d+[!#$%&()*+,-.\/:;<=>?@`~']{0,2}")
# Punctuation that is turned into a space before the digits are spelled out.
separator_pattern = re.compile(r'[!"#$%&()\*,-./:;<=>?@^_`{|}~]')
meridiem_words = {"am": "a m", "pm": "p m"}

# Cleaning the TM2019 dataset. Same as TM2020, restricted to dialogs whose
# instruction_id mentions 'pizza'.
for distro in distros_dict:
    if 'pizza' not in distro['instruction_id']:
        continue
    for dic in distro['utterances']:
        temp = []
        prev_was_number = False
        for word in dic['text'].split():
            word = word.lower()
            if word == "(deleted)":
                continue
            is_number = False
            clock_times = time_pattern.findall(word)
            if clock_times:
                # Use the captured groups rather than word[:-2], so trailing
                # punctuation ("10:30pm.") cannot cut into the digits, and
                # keep the real meridiem instead of always emitting "p m".
                for digits, meridiem in clock_times:
                    temp.extend(cleanPM(digits))
                    temp.append(meridiem_words[meridiem])
            elif word.isdecimal():
                # isdecimal() (not isnumeric()) guarantees int() succeeds.
                temp.append(num2words(int(word)).replace("-", " "))
                is_number = True
            elif number_pattern.search(word):
                temp.extend(cleanPM(separator_pattern.sub(" ", word)))
                is_number = True
            elif word == "pm":
                temp.append(meridiem_words[word])
            elif word == "am" and prev_was_number:
                # Only a bare "am" that follows a number is a meridiem;
                # otherwise it is the verb ("i am ...") and must be kept.
                temp.append(meridiem_words[word])
            else:
                temp.append(word)
            prev_was_number = is_number
        lines.append(" ".join(temp))

# Merging both datasets and removing punctuation
lines.extend(tm2020)
taskmaster = myv(lines)

In [11]:
# Collapse runs of identical consecutive utterances into a single line.
# Starting from an empty list also copes with an empty corpus, where
# taskmaster[0] would raise IndexError.
cleantaskmaster = []
for line in taskmaster:
    if not cleantaskmaster or line != cleantaskmaster[-1]:
        cleantaskmaster.append(line)

# Removes all starting words our model was overpredicting
leading_fillers = {"yes", "yep", "and", "yeah", "okay", "sure", "no", "ok"}

finaltaskmaster = []
for line in cleantaskmaster:
    # Compare the whole first word, not a character prefix: slicing by length
    # used to turn "yesterday" into "terday" and "android" into "roid", and
    # left a leading space behind ("yes please" -> " please").
    first_word, _, remainder = line.partition(" ")
    if first_word in leading_fillers:
        finaltaskmaster.append(remainder.strip())
    else:
        finaltaskmaster.append(line)

# Removes empty lines (including utterances that consisted only of a filler)
finaltaskmaster = [n for n in finaltaskmaster if n]

#Load the whole thing into a .txt file
with open("TM2.txt", "w") as outfile:
    outfile.writelines(f"{line}\n" for line in finaltaskmaster)

In [12]:
def pickUp(filename, cleanname):
    """Replaces all instances of 'pick up' to 'pickup' for consistency.

    Every line of *filename* is split on whitespace, each adjacent pair of
    tokens ``pick`` ``up`` is replaced by the single token ``pickup``, and the
    tokens are re-joined with single spaces. The resulting lines are written
    to *cleanname* separated by newlines (no trailing newline); blank input
    lines are kept as empty lines.
    """
    merged_lines = []
    # The input is read completely before the output is opened, so the two
    # paths may refer to the same file.
    with open(filename, "r") as source:
        for line in source:
            words = line.split()
            merged = []
            i = 0
            while i < len(words):
                if words[i] == "pick" and i + 1 < len(words) and words[i + 1] == "up":
                    merged.append("pickup")
                    i += 2  # consume both halves of the pair
                else:
                    # A trailing "up" that is not part of "pick up" (e.g.
                    # "hurry up") lands here and is kept.
                    merged.append(words[i])
                    i += 1
            merged_lines.append(" ".join(merged))

    with open(cleanname, "w") as out_file:
        out_file.write("\n".join(merged_lines))


In [13]:
# Replace every 'pick up' with 'pickup' for consistency: read the cleaned
# corpus from TM2.txt and write the result to TMpup2.txt.
pickUp('TM2.txt','TMpup2.txt')